// SPDX-License-Identifier: GPL-2.0 /* net/sched/sch_etf.c Earliest TxTime First queueing discipline. * * Authors: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com> * Vinicius Costa Gomes <vinicius.gomes@intel.com> */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/errqueue.h> #include <linux/rbtree.h> #include <linux/skbuff.h> #include <linux/posix-timers.h> #include <net/netlink.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/sock.h> #define DEADLINE_MODE_IS_ON(x) ((x)->flags & TC_ETF_DEADLINE_MODE_ON) #define OFFLOAD_IS_ON(x) ((x)->flags & TC_ETF_OFFLOAD_ON) struct etf_sched_data { bool offload; bool deadline_mode; int clockid; int queue; s32 delta; /* in ns */ ktime_t last; /* The txtime of the last skb sent to the netdevice. */ struct rb_root head; struct qdisc_watchdog watchdog; ktime_t (*get_time)(void); }; static const struct nla_policy etf_policy[TCA_ETF_MAX + 1] = { [TCA_ETF_PARMS] = { .len = sizeof(struct tc_etf_qopt) }, }; static inline int validate_input_params(struct tc_etf_qopt *qopt, struct netlink_ext_ack *extack) { /* Check if params comply to the following rules: * * Clockid and delta must be valid. * * * Dynamic clockids are not supported. * * * Delta must be a positive integer. * * Also note that for the HW offload case, we must * expect that system clocks have been synchronized to PHC. */ if (qopt->clockid < 0) { NL_SET_ERR_MSG(extack, "Dynamic clockids are not supported"); return -ENOTSUPP; } if (qopt->clockid != CLOCK_TAI) { NL_SET_ERR_MSG(extack, "Invalid clockid. 
CLOCK_TAI must be used"); return -EINVAL; } if (qopt->delta < 0) { NL_SET_ERR_MSG(extack, "Delta must be positive"); return -EINVAL; } return 0; } static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) { struct etf_sched_data *q = qdisc_priv(sch); ktime_t txtime = nskb->tstamp; struct sock *sk = nskb->sk; ktime_t now; if (!sk || !sk_fullsock(sk)) return false; if (!sock_flag(sk, SOCK_TXTIME)) return false; /* We don't perform crosstimestamping. * Drop if packet's clockid differs from qdisc's. */ if (sk->sk_clockid != q->clockid) return false; if (sk->sk_txtime_deadline_mode != q->deadline_mode) return false; now = q->get_time(); if (ktime_before(txtime, now) || ktime_before(txtime, q->last)) return false; return true; } static struct sk_buff *etf_peek_timesortedlist(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct rb_node *p; p = rb_first(&q->head); if (!p) return NULL; return rb_to_skb(p); } static void reset_watchdog(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = etf_peek_timesortedlist(sch); ktime_t next; if (!skb) return; next = ktime_sub_ns(skb->tstamp, q->delta); qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); } static void report_sock_error(struct sk_buff *skb, u32 err, u8 code) { struct sock_exterr_skb *serr; struct sk_buff *clone; ktime_t txtime = skb->tstamp; struct sock *sk = skb->sk; if (!sk || !sk_fullsock(sk) || !(sk->sk_txtime_report_errors)) return; clone = skb_clone(skb, GFP_ATOMIC); if (!clone) return; serr = SKB_EXT_ERR(clone); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_TXTIME; serr->ee.ee_type = 0; serr->ee.ee_code = code; serr->ee.ee_pad = 0; serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */ serr->ee.ee_info = txtime; /* low part of tstamp */ if (sock_queue_err_skb(sk, clone)) kfree_skb(clone); } static int etf_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, struct sk_buff **to_free) { struct etf_sched_data *q = qdisc_priv(sch); struct rb_node **p = &q->head.rb_node, *parent = NULL; ktime_t txtime = nskb->tstamp; if (!is_packet_valid(sch, nskb)) { report_sock_error(nskb, EINVAL, SO_EE_CODE_TXTIME_INVALID_PARAM); return qdisc_drop(nskb, sch, to_free); } while (*p) { struct sk_buff *skb; parent = *p; skb = rb_to_skb(parent); if (ktime_after(txtime, skb->tstamp)) p = &parent->rb_right; else p = &parent->rb_left; } rb_link_node(&nskb->rbnode, parent, p); rb_insert_color(&nskb->rbnode, &q->head); qdisc_qstats_backlog_inc(sch, nskb); sch->q.qlen++; /* Now we may need to re-arm the qdisc watchdog for the next packet. */ reset_watchdog(sch); return NET_XMIT_SUCCESS; } static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, bool drop) { struct etf_sched_data *q = qdisc_priv(sch); rb_erase(&skb->rbnode, &q->head); /* The rbnode field in the skb re-uses these fields, now that * we are done with the rbnode, reset them. 
*/ skb->next = NULL; skb->prev = NULL; skb->dev = qdisc_dev(sch); qdisc_qstats_backlog_dec(sch, skb); if (drop) { struct sk_buff *to_free = NULL; report_sock_error(skb, ECANCELED, SO_EE_CODE_TXTIME_MISSED); qdisc_drop(skb, sch, &to_free); kfree_skb_list(to_free); qdisc_qstats_overlimit(sch); } else { qdisc_bstats_update(sch, skb); q->last = skb->tstamp; } sch->q.qlen--; } static struct sk_buff *etf_dequeue_timesortedlist(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; ktime_t now, next; skb = etf_peek_timesortedlist(sch); if (!skb) return NULL; now = q->get_time(); /* Drop if packet has expired while in queue. */ if (ktime_before(skb->tstamp, now)) { timesortedlist_erase(sch, skb, true); skb = NULL; goto out; } /* When in deadline mode, dequeue as soon as possible and change the * txtime from deadline to (now + delta). */ if (q->deadline_mode) { timesortedlist_erase(sch, skb, false); skb->tstamp = now; goto out; } next = ktime_sub_ns(skb->tstamp, q->delta); /* Dequeue only if now is within the [txtime - delta, txtime] range. */ if (ktime_after(now, next)) timesortedlist_erase(sch, skb, false); else skb = NULL; out: /* Now we may need to re-arm the qdisc watchdog for the next packet. */ reset_watchdog(sch); return skb; } static void etf_disable_offload(struct net_device *dev, struct etf_sched_data *q) { struct tc_etf_qopt_offload etf = { }; const struct net_device_ops *ops; int err; if (!q->offload) return; ops = dev->netdev_ops; if (!ops->ndo_setup_tc) return; etf.queue = q->queue; etf.enable = 0; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); if (err < 0) pr_warn("Couldn't disable ETF offload for queue %d\n", etf.queue); } static int etf_enable_offload(struct net_device *dev, struct etf_sched_data *q, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_etf_qopt_offload etf = { }; int err; if (q->offload) return 0; if (!ops->ndo_setup_tc) { NL_SET_ERR_MSG(extack, "Specified device does not support ETF offload"); return -EOPNOTSUPP; } etf.queue = q->queue; etf.enable = 1; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_ETF, &etf); if (err < 0) { NL_SET_ERR_MSG(extack, "Specified device failed to setup ETF hardware offload"); return err; } return 0; } static int etf_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct etf_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct nlattr *tb[TCA_ETF_MAX + 1]; struct tc_etf_qopt *qopt; int err; if (!opt) { NL_SET_ERR_MSG(extack, "Missing ETF qdisc options which are mandatory"); return -EINVAL; } err = nla_parse_nested(tb, TCA_ETF_MAX, opt, etf_policy, extack); if (err < 0) return err; if (!tb[TCA_ETF_PARMS]) { NL_SET_ERR_MSG(extack, "Missing mandatory ETF parameters"); return -EINVAL; } qopt = nla_data(tb[TCA_ETF_PARMS]); pr_debug("delta %d clockid %d offload %s deadline %s\n", qopt->delta, qopt->clockid, OFFLOAD_IS_ON(qopt) ? "on" : "off", DEADLINE_MODE_IS_ON(qopt) ? "on" : "off"); err = validate_input_params(qopt, extack); if (err < 0) return err; q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); if (OFFLOAD_IS_ON(qopt)) { err = etf_enable_offload(dev, q, extack); if (err < 0) return err; } /* Everything went OK, save the parameters used. 
*/ q->delta = qopt->delta; q->clockid = qopt->clockid; q->offload = OFFLOAD_IS_ON(qopt); q->deadline_mode = DEADLINE_MODE_IS_ON(qopt); switch (q->clockid) { case CLOCK_REALTIME: q->get_time = ktime_get_real; break; case CLOCK_MONOTONIC: q->get_time = ktime_get; break; case CLOCK_BOOTTIME: q->get_time = ktime_get_boottime; break; case CLOCK_TAI: q->get_time = ktime_get_clocktai; break; default: NL_SET_ERR_MSG(extack, "Clockid is not supported"); return -ENOTSUPP; } qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid); return 0; } static void timesortedlist_clear(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct rb_node *p = rb_first(&q->head); while (p) { struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); rb_erase(&skb->rbnode, &q->head); rtnl_kfree_skbs(skb, skb); sch->q.qlen--; } } static void etf_reset(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); /* Only cancel watchdog if it's been initialized. */ if (q->watchdog.qdisc == sch) qdisc_watchdog_cancel(&q->watchdog); /* No matter which mode we are on, it's safe to clear both lists. */ timesortedlist_clear(sch); __qdisc_reset_queue(&sch->q); sch->qstats.backlog = 0; sch->q.qlen = 0; q->last = 0; } static void etf_destroy(struct Qdisc *sch) { struct etf_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); /* Only cancel watchdog if it's been initialized. */ if (q->watchdog.qdisc == sch) qdisc_watchdog_cancel(&q->watchdog); etf_disable_offload(dev, q); } static int etf_dump(struct Qdisc *sch, struct sk_buff *skb) { struct etf_sched_data *q = qdisc_priv(sch); struct tc_etf_qopt opt = { }; struct nlattr *nest; nest = nla_nest_start(skb, TCA_OPTIONS); if (!nest) goto nla_put_failure; opt.delta = q->delta; opt.clockid = q->clockid; if (q->offload) opt.flags |= TC_ETF_OFFLOAD_ON; if (q->deadline_mode) opt.flags |= TC_ETF_DEADLINE_MODE_ON; if (nla_put(skb, TCA_ETF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static struct Qdisc_ops etf_qdisc_ops __read_mostly = { .id = "etf", .priv_size = sizeof(struct etf_sched_data), .enqueue = etf_enqueue_timesortedlist, .dequeue = etf_dequeue_timesortedlist, .peek = etf_peek_timesortedlist, .init = etf_init, .reset = etf_reset, .destroy = etf_destroy, .dump = etf_dump, .owner = THIS_MODULE, }; static int __init etf_module_init(void) { return register_qdisc(&etf_qdisc_ops); } static void __exit etf_module_exit(void) { unregister_qdisc(&etf_qdisc_ops); } module_init(etf_module_init) module_exit(etf_module_exit) MODULE_LICENSE("GPL"); |
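For reference, the checks in is_packet_valid() above correspond to a small amount of userspace setup: the sending socket must have SOCK_TXTIME enabled with the same clockid (CLOCK_TAI) and deadline-mode setting as the qdisc, and each packet must carry its transmission time in an SCM_TXTIME control message, which the kernel copies into skb->tstamp. The sketch below is a minimal, hedged illustration of that setup using the SO_TXTIME socket option; the UDP destination address, port, and the 1 ms offset are arbitrary example values, not anything taken from sch_etf.c.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/net_tstamp.h>	/* struct sock_txtime, SOF_TXTIME_* flags */

#ifndef SO_TXTIME
#define SO_TXTIME	61
#define SCM_TXTIME	SO_TXTIME
#endif

/* Attach the per-packet transmission time as an SCM_TXTIME cmsg; the kernel
 * copies it into skb->tstamp, which etf_enqueue_timesortedlist() sorts on. */
static ssize_t send_at(int fd, const struct sockaddr_in *dst,
		       const void *buf, size_t len, uint64_t txtime_ns)
{
	char control[CMSG_SPACE(sizeof(uint64_t))] = {0};
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_name = (void *)dst, .msg_namelen = sizeof(*dst),
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = control, .msg_controllen = sizeof(control),
	};
	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_TXTIME;
	cm->cmsg_len = CMSG_LEN(sizeof(txtime_ns));
	memcpy(CMSG_DATA(cm), &txtime_ns, sizeof(txtime_ns));

	return sendmsg(fd, &msg, 0);
}

int main(void)
{
	/* Socket-level setup checked by is_packet_valid(): clockid must equal
	 * the qdisc's clockid, and SOF_TXTIME_DEADLINE_MODE must be set if and
	 * only if the qdisc was configured with deadline_mode. */
	struct sock_txtime cfg = {
		.clockid = CLOCK_TAI,
		.flags = SOF_TXTIME_REPORT_ERRORS,	/* errors arrive via MSG_ERRQUEUE */
	};
	struct sockaddr_in dst = { .sin_family = AF_INET, .sin_port = htons(7788) };
	struct timespec now;
	uint64_t txtime;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0 || inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr) != 1)
		return 1;
	if (setsockopt(fd, SOL_SOCKET, SO_TXTIME, &cfg, sizeof(cfg)) < 0) {
		perror("SO_TXTIME");
		return 1;
	}

	/* Ask for transmission 1 ms from now on CLOCK_TAI. */
	clock_gettime(CLOCK_TAI, &now);
	txtime = now.tv_sec * 1000000000ULL + now.tv_nsec + 1000000;
	if (send_at(fd, &dst, "hi", 2, txtime) < 0)
		perror("sendmsg");
	return 0;
}

Because SOF_TXTIME_REPORT_ERRORS is set, a packet dropped by the qdisc (invalid parameters or missed deadline) is reported back by report_sock_error() above as an SO_EE_ORIGIN_TXTIME entry on the socket's error queue, which the program could read with recvmsg(fd, ..., MSG_ERRQUEUE).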
/* * dir.c * * PURPOSE * Directory handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: * ftp://prep.ai.mit.edu/pub/gnu/GPL * Each contributing author retains all rights to their own work. * * (C) 1998-2004 Ben Fennema * * HISTORY * * 10/05/98 dgb Split directory operations into its own file * Implemented directory reads via do_udf_readdir * 10/06/98 Made directory operations work! * 11/17/98 Rewrote directory to support ICBTAG_FLAG_AD_LONG * 11/25/98 blf Rewrote directory handling (readdir+lookup) to support reading * across blocks. * 12/12/98 Split out the lookup code to namei.c. bulk of directory * code now in directory.c:udf_fileident_read. */ #include "udfdecl.h" #include <linux/string.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/bio.h> #include <linux/iversion.h> #include "udf_i.h" #include "udf_sb.h" static int udf_readdir(struct file *file, struct dir_context *ctx) { struct inode *dir = file_inode(file); struct udf_inode_info *iinfo = UDF_I(dir); struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL}; struct fileIdentDesc *fi = NULL; struct fileIdentDesc cfi; udf_pblk_t block, iblock; loff_t nf_pos, emit_pos = 0; int flen; unsigned char *fname = NULL, *copy_name = NULL; unsigned char *nameptr; uint16_t liu; uint8_t lfi; loff_t size = udf_ext0_offset(dir) + dir->i_size; struct buffer_head *tmp, *bha[16]; struct kernel_lb_addr eloc; uint32_t elen; sector_t offset; int i, num, ret = 0; struct extent_position epos = { NULL, 0, {0, 0} }; struct super_block *sb = dir->i_sb; bool pos_valid = false; if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) return 0; ctx->pos = 1; } nf_pos = (ctx->pos - 1) << 2; if (nf_pos >= size) goto out; /* * Something changed since last readdir (either lseek was called or dir * changed)? We need to verify the position correctly points at the * beginning of some dir entry so that the directory parsing code does * not get confused. Since UDF does not have any reliable way of * identifying beginning of dir entry (names are under user control), * we need to scan the directory from the beginning. 
*/ if (!inode_eq_iversion(dir, file->f_version)) { emit_pos = nf_pos; nf_pos = 0; } else { pos_valid = true; } fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); if (!fname) { ret = -ENOMEM; goto out; } if (nf_pos == 0) nf_pos = udf_ext0_offset(dir); fibh.soffset = fibh.eoffset = nf_pos & (sb->s_blocksize - 1); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { if (inode_bmap(dir, nf_pos >> sb->s_blocksize_bits, &epos, &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { ret = -ENOENT; goto out; } block = udf_get_lb_pblock(sb, &eloc, offset); if ((++offset << sb->s_blocksize_bits) < elen) { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) epos.offset -= sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) epos.offset -= sizeof(struct long_ad); } else { offset = 0; } if (!(fibh.sbh = fibh.ebh = udf_tread(sb, block))) { ret = -EIO; goto out; } if (!(offset & ((16 >> (sb->s_blocksize_bits - 9)) - 1))) { i = 16 >> (sb->s_blocksize_bits - 9); if (i + offset > (elen >> sb->s_blocksize_bits)) i = (elen >> sb->s_blocksize_bits) - offset; for (num = 0; i > 0; i--) { block = udf_get_lb_pblock(sb, &eloc, offset + i); tmp = udf_tgetblk(sb, block); if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) bha[num++] = tmp; else brelse(tmp); } if (num) { ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha); for (i = 0; i < num; i++) brelse(bha[i]); } } } while (nf_pos < size) { struct kernel_lb_addr tloc; loff_t cur_pos = nf_pos; /* Update file position only if we got past the current one */ if (nf_pos >= emit_pos) { ctx->pos = (nf_pos >> 2) + 1; pos_valid = true; } fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, &elen, &offset); if (!fi) goto out; /* Still not at offset where user asked us to read from? */ if (cur_pos < emit_pos) continue; liu = le16_to_cpu(cfi.lengthOfImpUse); lfi = cfi.lengthFileIdent; if (fibh.sbh == fibh.ebh) { nameptr = fi->fileIdent + liu; } else { int poffset; /* Unpaded ending offset */ poffset = fibh.soffset + sizeof(struct fileIdentDesc) + liu + lfi; if (poffset >= lfi) { nameptr = (char *)(fibh.ebh->b_data + poffset - lfi); } else { if (!copy_name) { copy_name = kmalloc(UDF_NAME_LEN, GFP_NOFS); if (!copy_name) { ret = -ENOMEM; goto out; } } nameptr = copy_name; memcpy(nameptr, fi->fileIdent + liu, lfi - poffset); memcpy(nameptr + lfi - poffset, fibh.ebh->b_data, poffset); } } if ((cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) continue; } if ((cfi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) continue; } if (cfi.fileCharacteristics & FID_FILE_CHAR_PARENT) { if (!dir_emit_dotdot(file, ctx)) goto out; continue; } flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN); if (flen < 0) continue; tloc = lelb_to_cpu(cfi.icb.extLocation); iblock = udf_get_lb_pblock(sb, &tloc, 0); if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN)) goto out; } /* end while */ ctx->pos = (nf_pos >> 2) + 1; pos_valid = true; out: if (pos_valid) file->f_version = inode_query_iversion(dir); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); brelse(epos.bh); kfree(fname); kfree(copy_name); return ret; } /* readdir and lookup functions */ const struct file_operations udf_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = udf_readdir, .unlocked_ioctl = udf_ioctl, .fsync = generic_file_fsync, }; |
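The nf_pos/ctx->pos handling above is what userspace sees as an opaque directory cookie: udf_readdir() hands out (byte offset >> 2) + 1 as the position, and whenever that position no longer matches the cached inode version (a seek happened or the directory changed), it re-walks the directory from the start so the offset is known to land on an entry boundary. Below is a minimal sketch, using only portable telldir()/seekdir() calls, of the kind of userspace seek that exercises this path; the directory path and printed output are illustrative assumptions.

#include <dirent.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : ".";
	DIR *d = opendir(path);
	struct dirent *de;
	long first = -1;

	if (!d) {
		perror("opendir");
		return 1;
	}
	/* Remember the cookie right after the first entry ... */
	if ((de = readdir(d)))
		first = telldir(d);
	while ((de = readdir(d)))
		puts(de->d_name);
	/* ... and seek back to it. For UDF the cookie encodes a byte offset
	 * into the directory ((offset >> 2) + 1), so after a seek (or a
	 * concurrent directory change) udf_readdir() re-walks the entries
	 * from the beginning to re-validate the position. */
	if (first >= 0) {
		seekdir(d, first);
		if ((de = readdir(d)))
			printf("after seekdir: %s\n", de->d_name);
	}
	closedir(d);
	return 0;
}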
/* * IPVS: Destination Hashing scheduling module * * Authors: Wensong Zhang <wensong@gnuchina.org> * * Inspired by the consistent hashing scheduler patch from * Thomas Proell <proellt@gmx.de> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * Changes: * */ /* * The dh algorithm is to select server by the hash key of destination IP * address. The pseudo code is as follows: * * n <- servernode[dest_ip]; * if (n is dead) OR * (n is overloaded) OR (n.weight <= 0) then * return NULL; * * return n; * * Notes that servernode is a 256-bucket hash table that maps the hash * index derived from packet destination IP address to the current server * array. If the dh scheduler is used in cache cluster, it is good to * combine it with cache_bypass feature. When the statically assigned * server is dead or overloaded, the load balancer can bypass the cache * server and send requests to the original server directly. * */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/ip.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/hash.h> #include <net/ip_vs.h> /* * IPVS DH bucket */ struct ip_vs_dh_bucket { struct ip_vs_dest __rcu *dest; /* real server (cache) */ }; /* * for IPVS DH entry hash table */ #ifndef CONFIG_IP_VS_DH_TAB_BITS #define CONFIG_IP_VS_DH_TAB_BITS 8 #endif #define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS #define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) #define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) struct ip_vs_dh_state { struct ip_vs_dh_bucket buckets[IP_VS_DH_TAB_SIZE]; struct rcu_head rcu_head; }; /* * Returns hash value for IPVS DH entry */ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr) { __be32 addr_fold = addr->ip; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif return hash_32(ntohl(addr_fold), IP_VS_DH_TAB_BITS); } /* * Get ip_vs_dest associated with supplied parameters. */ static inline struct ip_vs_dest * ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr) { return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest); } /* * Assign all the hash buckets of the specified table with the service. 
*/ static int ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc) { int i; struct ip_vs_dh_bucket *b; struct list_head *p; struct ip_vs_dest *dest; bool empty; b = &s->buckets[0]; p = &svc->destinations; empty = list_empty(p); for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { dest = rcu_dereference_protected(b->dest, 1); if (dest) ip_vs_dest_put(dest); if (empty) RCU_INIT_POINTER(b->dest, NULL); else { if (p == &svc->destinations) p = p->next; dest = list_entry(p, struct ip_vs_dest, n_list); ip_vs_dest_hold(dest); RCU_INIT_POINTER(b->dest, dest); p = p->next; } b++; } return 0; } /* * Flush all the hash buckets of the specified table. */ static void ip_vs_dh_flush(struct ip_vs_dh_state *s) { int i; struct ip_vs_dh_bucket *b; struct ip_vs_dest *dest; b = &s->buckets[0]; for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { dest = rcu_dereference_protected(b->dest, 1); if (dest) { ip_vs_dest_put(dest); RCU_INIT_POINTER(b->dest, NULL); } b++; } } static int ip_vs_dh_init_svc(struct ip_vs_service *svc) { struct ip_vs_dh_state *s; /* allocate the DH table for this service */ s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL); if (s == NULL) return -ENOMEM; svc->sched_data = s; IP_VS_DBG(6, "DH hash table (memory=%zdbytes) allocated for " "current service\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); /* assign the hash buckets with current dests */ ip_vs_dh_reassign(s, svc); return 0; } static void ip_vs_dh_done_svc(struct ip_vs_service *svc) { struct ip_vs_dh_state *s = svc->sched_data; /* got to clean up hash buckets here */ ip_vs_dh_flush(s); /* release the table itself */ kfree_rcu(s, rcu_head); IP_VS_DBG(6, "DH hash table (memory=%zdbytes) released\n", sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); } static int ip_vs_dh_dest_changed(struct ip_vs_service *svc, struct ip_vs_dest *dest) { struct ip_vs_dh_state *s = svc->sched_data; /* assign the hash buckets with the updated service */ ip_vs_dh_reassign(s, svc); return 0; } /* * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, * consider that the server is overloaded here. 
*/ static inline int is_overloaded(struct ip_vs_dest *dest) { return dest->flags & IP_VS_DEST_F_OVERLOAD; } /* * Destination hashing scheduling */ static struct ip_vs_dest * ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest; struct ip_vs_dh_state *s; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); s = (struct ip_vs_dh_state *) svc->sched_data; dest = ip_vs_dh_get(svc->af, s, &iph->daddr); if (!dest || !(dest->flags & IP_VS_DEST_F_AVAILABLE) || atomic_read(&dest->weight) <= 0 || is_overloaded(dest)) { ip_vs_scheduler_err(svc, "no destination available"); return NULL; } IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n", IP_VS_DBG_ADDR(svc->af, &iph->daddr), IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port)); return dest; } /* * IPVS DH Scheduler structure */ static struct ip_vs_scheduler ip_vs_dh_scheduler = { .name = "dh", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), .init_service = ip_vs_dh_init_svc, .done_service = ip_vs_dh_done_svc, .add_dest = ip_vs_dh_dest_changed, .del_dest = ip_vs_dh_dest_changed, .schedule = ip_vs_dh_schedule, }; static int __init ip_vs_dh_init(void) { return register_ip_vs_scheduler(&ip_vs_dh_scheduler); } static void __exit ip_vs_dh_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); synchronize_rcu(); } module_init(ip_vs_dh_init); module_exit(ip_vs_dh_cleanup); MODULE_LICENSE("GPL"); |
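The header comment's servernode[dest_ip] lookup boils down to ip_vs_dh_hashkey() above: fold the destination address to 32 bits and hash it into one of IP_VS_DH_TAB_SIZE (256) buckets, each of which pins one real server. The standalone sketch below reproduces that mapping in userspace so the bucket behaviour can be observed directly; the golden-ratio multiplier is an assumption standing in for the kernel's hash_32(), and the example addresses are arbitrary.

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

#define DH_TAB_BITS 8
#define DH_TAB_SIZE (1u << DH_TAB_BITS)

/* Userspace stand-in for the kernel's hash_32(): multiplicative hash with a
 * 32-bit golden-ratio constant, keeping the top DH_TAB_BITS bits. */
static unsigned int dh_hashkey(uint32_t daddr_be)
{
	return (ntohl(daddr_be) * 0x61C88647u) >> (32 - DH_TAB_BITS);
}

int main(void)
{
	const char *dests[] = { "198.51.100.7", "198.51.100.8", "203.0.113.99" };
	struct in_addr a;

	for (unsigned int i = 0; i < sizeof(dests) / sizeof(dests[0]); i++) {
		if (inet_pton(AF_INET, dests[i], &a) != 1)
			continue;
		/* The same destination always lands in the same of the 256
		 * buckets, so it is always forwarded to the same real server
		 * (cache), which is the point of destination hashing. */
		printf("%-15s -> bucket %u of %u\n",
		       dests[i], dh_hashkey(a.s_addr), DH_TAB_SIZE);
	}
	return 0;
}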
/* * Packet matching code. * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org> * Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/capability.h> #include <linux/in.h> #include <linux/skbuff.h> #include <linux/kmod.h> #include <linux/vmalloc.h> #include <linux/netdevice.h> #include <linux/module.h> #include <linux/poison.h> #include <linux/icmpv6.h> #include <net/ipv6.h> #include <net/compat.h> #include <linux/uaccess.h> #include <linux/mutex.h> #include <linux/proc_fs.h> #include <linux/err.h> #include <linux/cpumask.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_log.h> #include "../../netfilter/xt_repldata.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("IPv6 packet filter"); MODULE_ALIAS("ip6t_icmp6"); void *ip6t_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(ip6t, IP6T); } EXPORT_SYMBOL_GPL(ip6t_alloc_initial_table); /* Returns whether matches rule or not. */ /* Performance critical - called for every packet */ static inline bool ip6_packet_match(const struct sk_buff *skb, const char *indev, const char *outdev, const struct ip6t_ip6 *ip6info, unsigned int *protoff, int *fragoff, bool *hotdrop) { unsigned long ret; const struct ipv6hdr *ipv6 = ipv6_hdr(skb); if (NF_INVF(ip6info, IP6T_INV_SRCIP, ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk, &ip6info->src)) || NF_INVF(ip6info, IP6T_INV_DSTIP, ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk, &ip6info->dst))) return false; ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask); if (NF_INVF(ip6info, IP6T_INV_VIA_IN, ret != 0)) return false; ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask); if (NF_INVF(ip6info, IP6T_INV_VIA_OUT, ret != 0)) return false; /* ... might want to do something with class and flowlabel here ... */ /* look for the desired protocol header */ if (ip6info->flags & IP6T_F_PROTO) { int protohdr; unsigned short _frag_off; protohdr = ipv6_find_hdr(skb, protoff, -1, &_frag_off, NULL); if (protohdr < 0) { if (_frag_off == 0) *hotdrop = true; return false; } *fragoff = _frag_off; if (ip6info->proto == protohdr) { if (ip6info->invflags & IP6T_INV_PROTO) return false; return true; } /* We need match for the '-p all', too! */ if ((ip6info->proto != 0) && !(ip6info->invflags & IP6T_INV_PROTO)) return false; } return true; } /* should be ip6 safe */ static bool ip6_checkentry(const struct ip6t_ip6 *ipv6) { if (ipv6->flags & ~IP6T_F_MASK) return false; if (ipv6->invflags & ~IP6T_INV_MASK) return false; return true; } static unsigned int ip6t_error(struct sk_buff *skb, const struct xt_action_param *par) { net_info_ratelimited("error: `%s'\n", (const char *)par->targinfo); return NF_DROP; } static inline struct ip6t_entry * get_entry(const void *base, unsigned int offset) { return (struct ip6t_entry *)(base + offset); } /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ static inline bool unconditional(const struct ip6t_entry *e) { static const struct ip6t_ip6 uncond; return e->target_offset == sizeof(struct ip6t_entry) && memcmp(&e->ipv6, &uncond, sizeof(uncond)) == 0; } static inline const struct xt_entry_target * ip6t_get_target_c(const struct ip6t_entry *e) { return ip6t_get_target((struct ip6t_entry *)e); } #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* This cries for unification! 
*/ static const char *const hooknames[] = { [NF_INET_PRE_ROUTING] = "PREROUTING", [NF_INET_LOCAL_IN] = "INPUT", [NF_INET_FORWARD] = "FORWARD", [NF_INET_LOCAL_OUT] = "OUTPUT", [NF_INET_POST_ROUTING] = "POSTROUTING", }; enum nf_ip_trace_comments { NF_IP6_TRACE_COMMENT_RULE, NF_IP6_TRACE_COMMENT_RETURN, NF_IP6_TRACE_COMMENT_POLICY, }; static const char *const comments[] = { [NF_IP6_TRACE_COMMENT_RULE] = "rule", [NF_IP6_TRACE_COMMENT_RETURN] = "return", [NF_IP6_TRACE_COMMENT_POLICY] = "policy", }; static const struct nf_loginfo trace_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { .level = LOGLEVEL_WARNING, .logflags = NF_LOG_DEFAULT_MASK, }, }, }; /* Mildly perf critical (only if packet tracing is on) */ static inline int get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, const char *hookname, const char **chainname, const char **comment, unsigned int *rulenum) { const struct xt_standard_target *t = (void *)ip6t_get_target_c(s); if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) { /* Head of user chain: ERROR target with chainname */ *chainname = t->target.data; (*rulenum) = 0; } else if (s == e) { (*rulenum)++; if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP6_TRACE_COMMENT_POLICY] : comments[NF_IP6_TRACE_COMMENT_RETURN]; } return 1; } else (*rulenum)++; return 0; } static void trace_packet(struct net *net, const struct sk_buff *skb, unsigned int hook, const struct net_device *in, const struct net_device *out, const char *tablename, const struct xt_table_info *private, const struct ip6t_entry *e) { const struct ip6t_entry *root; const char *hookname, *chainname, *comment; const struct ip6t_entry *iter; unsigned int rulenum = 0; root = get_entry(private->entries, private->hook_entry[hook]); hookname = chainname = hooknames[hook]; comment = comments[NF_IP6_TRACE_COMMENT_RULE]; xt_entry_foreach(iter, root, private->size - private->hook_entry[hook]) if (get_chainname_rulenum(iter, e, hookname, &chainname, &comment, &rulenum) != 0) break; nf_log_trace(net, AF_INET6, hook, skb, in, out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", tablename, chainname, comment, rulenum); } #endif static inline struct ip6t_entry * ip6t_next_entry(const struct ip6t_entry *entry) { return (void *)entry + entry->next_offset; } /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int ip6t_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct xt_table *table) { unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); /* Initializing verdict to NF_DROP keeps gcc happy. */ unsigned int verdict = NF_DROP; const char *indev, *outdev; const void *table_base; struct ip6t_entry *e, **jumpstack; unsigned int stackidx, cpu; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; /* Initialization */ stackidx = 0; indev = state->in ? state->in->name : nulldevname; outdev = state->out ? state->out->name : nulldevname; /* We handle fragments by dealing with the first fragment as * if it was a normal packet. All other fragments are treated * normally, except that they will NEVER match rules that ask * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. 
*/ acpar.fragoff = 0; acpar.hotdrop = false; acpar.state = state; WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); addend = xt_write_recseq_begin(); private = READ_ONCE(table->private); /* Address dependency. */ cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; /* Switch to alternate jumpstack if we're being invoked via TEE. * TEE issues XT_CONTINUE verdict on original skb so we must not * clobber the jumpstack. * * For recursion via REJECT or SYNPROXY the stack will be clobbered * but it is no problem since absolute verdict is issued by these. */ if (static_key_false(&xt_tee_enabled)) jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; struct xt_counters *counter; WARN_ON(!e); acpar.thoff = 0; if (!ip6_packet_match(skb, indev, outdev, &e->ipv6, &acpar.thoff, &acpar.fragoff, &acpar.hotdrop)) { no_match: e = ip6t_next_entry(e); continue; } xt_ematch_foreach(ematch, e) { acpar.match = ematch->u.kernel.match; acpar.matchinfo = ematch->data; if (!acpar.match->match(skb, &acpar)) goto no_match; } counter = xt_get_this_cpu_counter(&e->counters); ADD_COUNTER(*counter, skb->len, 1); t = ip6t_get_target_c(e); WARN_ON(!t->u.kernel.target); #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) /* The packet is traced: log it */ if (unlikely(skb->nf_trace)) trace_packet(state->net, skb, hook, state->in, state->out, table->name, private, e); #endif /* Standard target? */ if (!t->u.kernel.target->target) { int v; v = ((struct xt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? */ if (v != XT_RETURN) { verdict = (unsigned int)(-v) - 1; break; } if (stackidx == 0) e = get_entry(table_base, private->underflow[hook]); else e = ip6t_next_entry(jumpstack[--stackidx]); continue; } if (table_base + v != ip6t_next_entry(e) && !(e->ipv6.flags & IP6T_F_GOTO)) { if (unlikely(stackidx >= private->stacksize)) { verdict = NF_DROP; break; } jumpstack[stackidx++] = e; } e = get_entry(table_base, v); continue; } acpar.target = t->u.kernel.target; acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); if (verdict == XT_CONTINUE) e = ip6t_next_entry(e); else /* Verdict */ break; } while (!acpar.hotdrop); xt_write_recseq_end(addend); local_bh_enable(); if (acpar.hotdrop) return NF_DROP; else return verdict; } /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0, unsigned int *offsets) { unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset to 0 as we leave), and comefrom to save source hook bitmask */ for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ip6t_entry *e = entry0 + pos; if (!(valid_hooks & (1 << hook))) continue; /* Set initial back pointer. */ e->counters.pcnt = pos; for (;;) { const struct xt_standard_target *t = (void *)ip6t_get_target_c(e); int visited = e->comefrom & (1 << hook); if (e->comefrom & (1 << NF_INET_NUMHOOKS)) return 0; e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && t->verdict < 0) || visited) { unsigned int oldpos, size; /* Return: backtrack through the last big jump. 
*/ do { e->comefrom ^= (1<<NF_INET_NUMHOOKS); oldpos = pos; pos = e->counters.pcnt; e->counters.pcnt = 0; /* We're at the start. */ if (pos == oldpos) goto next; e = entry0 + pos; } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = entry0 + pos + size; if (pos + size >= newinfo->size) return 0; e->counters.pcnt = pos; pos += size; } else { int newpos = t->verdict; if (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0 && newpos >= 0) { /* This a jump; chase it. */ if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; if (newpos >= newinfo->size) return 0; } e = entry0 + newpos; e->counters.pcnt = pos; pos = newpos; } } next: ; } return 1; } static void cleanup_match(struct xt_entry_match *m, struct net *net) { struct xt_mtdtor_param par; par.net = net; par.match = m->u.kernel.match; par.matchinfo = m->data; par.family = NFPROTO_IPV6; if (par.match->destroy != NULL) par.match->destroy(&par); module_put(par.match->me); } static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { const struct ip6t_ip6 *ipv6 = par->entryinfo; par->match = m->u.kernel.match; par->matchinfo = m->data; return xt_check_match(par, m->u.match_size - sizeof(*m), ipv6->proto, ipv6->invflags & IP6T_INV_PROTO); } static int find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { struct xt_match *match; int ret; match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, m->u.user.revision); if (IS_ERR(match)) return PTR_ERR(match); m->u.kernel.match = match; ret = check_match(m, par); if (ret) goto err; return 0; err: module_put(m->u.kernel.match->me); return ret; } static int check_target(struct ip6t_entry *e, struct net *net, const char *name) { struct xt_entry_target *t = ip6t_get_target(e); struct xt_tgchk_param par = { .net = net, .table = name, .entryinfo = e, .target = t->u.kernel.target, .targinfo = t->data, .hook_mask = e->comefrom, .family = NFPROTO_IPV6, }; return xt_check_target(&par, t->u.target_size - sizeof(*t), e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO); } static int find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, unsigned int size, struct xt_percpu_counter_alloc_state *alloc_state) { struct xt_entry_target *t; struct xt_target *target; int ret; unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; if (!xt_percpu_counter_alloc(alloc_state, &e->counters)) return -ENOMEM; j = 0; memset(&mtpar, 0, sizeof(mtpar)); mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ipv6; mtpar.hook_mask = e->comefrom; mtpar.family = NFPROTO_IPV6; xt_ematch_foreach(ematch, e) { ret = find_check_match(ematch, &mtpar); if (ret != 0) goto cleanup_matches; ++j; } t = ip6t_get_target(e); target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { ret = PTR_ERR(target); goto cleanup_matches; } t->u.kernel.target = target; ret = check_target(e, net, name); if (ret) goto err; return 0; err: module_put(t->u.kernel.target->me); cleanup_matches: xt_ematch_foreach(ematch, e) { if (j-- == 0) break; cleanup_match(ematch, net); } xt_percpu_counter_free(&e->counters); return ret; } static bool check_underflow(const struct ip6t_entry *e) { const struct xt_entry_target *t; unsigned int verdict; if (!unconditional(e)) return false; t = ip6t_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) return false; verdict = ((struct xt_standard_target *)t)->verdict; 
verdict = -verdict - 1; return verdict == NF_DROP || verdict == NF_ACCEPT; } static int check_entry_size_and_hooks(struct ip6t_entry *e, struct xt_table_info *newinfo, const unsigned char *base, const unsigned char *limit, const unsigned int *hook_entries, const unsigned int *underflows, unsigned int valid_hooks) { unsigned int h; int err; if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct ip6t_entry) >= limit || (unsigned char *)e + e->next_offset > limit) return -EINVAL; if (e->next_offset < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) return -EINVAL; if (!ip6_checkentry(&e->ipv6)) return -EINVAL; err = xt_check_entry_offsets(e, e->elems, e->target_offset, e->next_offset); if (err) return err; /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) continue; if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) return -EINVAL; newinfo->underflow[h] = underflows[h]; } } /* Clear counters and comefrom */ e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; return 0; } static void cleanup_entry(struct ip6t_entry *e, struct net *net) { struct xt_tgdtor_param par; struct xt_entry_target *t; struct xt_entry_match *ematch; /* Cleanup all matches */ xt_ematch_foreach(ematch, e) cleanup_match(ematch, net); t = ip6t_get_target(e); par.net = net; par.target = t->u.kernel.target; par.targinfo = t->data; par.family = NFPROTO_IPV6; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); xt_percpu_counter_free(&e->counters); } /* Checks and translates the user-supplied table segment (held in newinfo) */ static int translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, const struct ip6t_replace *repl) { struct xt_percpu_counter_alloc_state alloc_state = { 0 }; struct ip6t_entry *iter; unsigned int *offsets; unsigned int i; int ret = 0; newinfo->size = repl->size; newinfo->number = repl->num_entries; /* Init all hooks to impossible value. */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = 0xFFFFFFFF; newinfo->underflow[i] = 0xFFFFFFFF; } offsets = xt_alloc_entry_offsets(newinfo->number); if (!offsets) return -ENOMEM; i = 0; /* Walk through entries, checking offsets. 
*/ xt_entry_foreach(iter, entry0, newinfo->size) { ret = check_entry_size_and_hooks(iter, newinfo, entry0, entry0 + repl->size, repl->hook_entry, repl->underflow, repl->valid_hooks); if (ret != 0) goto out_free; if (i < repl->num_entries) offsets[i] = (void *)iter - entry0; ++i; if (strcmp(ip6t_get_target(iter)->u.user.name, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } ret = -EINVAL; if (i != repl->num_entries) goto out_free; ret = xt_check_table_hooks(newinfo, repl->valid_hooks); if (ret) goto out_free; if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) { ret = -ELOOP; goto out_free; } kvfree(offsets); /* Finally, each sanity check must pass */ i = 0; xt_entry_foreach(iter, entry0, newinfo->size) { ret = find_check_entry(iter, net, repl->name, repl->size, &alloc_state); if (ret != 0) break; ++i; } if (ret != 0) { xt_entry_foreach(iter, entry0, newinfo->size) { if (i-- == 0) break; cleanup_entry(iter, net); } return ret; } return ret; out_free: kvfree(offsets); return ret; } static void get_counters(const struct xt_table_info *t, struct xt_counters counters[]) { struct ip6t_entry *iter; unsigned int cpu; unsigned int i; for_each_possible_cpu(cpu) { seqcount_t *s = &per_cpu(xt_recseq, cpu); i = 0; xt_entry_foreach(iter, t->entries, t->size) { struct xt_counters *tmp; u64 bcnt, pcnt; unsigned int start; tmp = xt_get_per_cpu_counter(&iter->counters, cpu); do { start = read_seqcount_begin(s); bcnt = tmp->bcnt; pcnt = tmp->pcnt; } while (read_seqcount_retry(s, start)); ADD_COUNTER(counters[i], bcnt, pcnt); ++i; cond_resched(); } } } static void get_old_counters(const struct xt_table_info *t, struct xt_counters counters[]) { struct ip6t_entry *iter; unsigned int cpu, i; for_each_possible_cpu(cpu) { i = 0; xt_entry_foreach(iter, t->entries, t->size) { const struct xt_counters *tmp; tmp = xt_get_per_cpu_counter(&iter->counters, cpu); ADD_COUNTER(counters[i], tmp->bcnt, tmp->pcnt); ++i; } cond_resched(); } } static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct xt_counters) * private->number; counters = vzalloc(countersize); if (counters == NULL) return ERR_PTR(-ENOMEM); get_counters(private, counters); return counters; } static int copy_entries_to_user(unsigned int total_size, const struct xt_table *table, void __user *userptr) { unsigned int off, num; const struct ip6t_entry *e; struct xt_counters *counters; const struct xt_table_info *private = table->private; int ret = 0; const void *loc_cpu_entry; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); loc_cpu_entry = private->entries; /* FIXME: use iterator macros --RR */ /* ... 
then go back and fix counters and names */ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ unsigned int i; const struct xt_entry_match *m; const struct xt_entry_target *t; e = loc_cpu_entry + off; if (copy_to_user(userptr + off, e, sizeof(*e))) { ret = -EFAULT; goto free_counters; } if (copy_to_user(userptr + off + offsetof(struct ip6t_entry, counters), &counters[num], sizeof(counters[num])) != 0) { ret = -EFAULT; goto free_counters; } for (i = sizeof(struct ip6t_entry); i < e->target_offset; i += m->u.match_size) { m = (void *)e + i; if (xt_match_to_user(m, userptr + off + i)) { ret = -EFAULT; goto free_counters; } } t = ip6t_get_target_c(e); if (xt_target_to_user(t, userptr + off + e->target_offset)) { ret = -EFAULT; goto free_counters; } } free_counters: vfree(counters); return ret; } #ifdef CONFIG_COMPAT static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; if (v > 0) v += xt_compat_calc_jump(AF_INET6, v); memcpy(dst, &v, sizeof(v)); } static int compat_standard_to_user(void __user *dst, const void *src) { compat_int_t cv = *(int *)src; if (cv > 0) cv -= xt_compat_calc_jump(AF_INET6, cv); return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; } static int compat_calc_entry(const struct ip6t_entry *e, const struct xt_table_info *info, const void *base, struct xt_table_info *newinfo) { const struct xt_entry_match *ematch; const struct xt_entry_target *t; unsigned int entry_offset; int off, i, ret; off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); entry_offset = (void *)e - base; xt_ematch_foreach(ematch, e) off += xt_compat_match_offset(ematch->u.kernel.match); t = ip6t_get_target_c(e); off += xt_compat_target_offset(t->u.kernel.target); newinfo->size -= off; ret = xt_compat_add_offset(AF_INET6, entry_offset, off); if (ret) return ret; for (i = 0; i < NF_INET_NUMHOOKS; i++) { if (info->hook_entry[i] && (e < (struct ip6t_entry *)(base + info->hook_entry[i]))) newinfo->hook_entry[i] -= off; if (info->underflow[i] && (e < (struct ip6t_entry *)(base + info->underflow[i]))) newinfo->underflow[i] -= off; } return 0; } static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { struct ip6t_entry *iter; const void *loc_cpu_entry; int ret; if (!newinfo || !info) return -EINVAL; /* we dont care about newinfo->entries */ memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries; ret = xt_compat_init_offsets(AF_INET6, info->number); if (ret) return ret; xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) return ret; } return 0; } #endif static int get_info(struct net *net, void __user *user, const int *len, int compat) { char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; int ret; if (*len != sizeof(struct ip6t_getinfo)) return -EINVAL; if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_COMPAT if (compat) xt_compat_lock(AF_INET6); #endif t = xt_request_find_table_lock(net, AF_INET6, name); if (!IS_ERR(t)) { struct ip6t_getinfo info; const struct xt_table_info *private = t->private; #ifdef CONFIG_COMPAT struct xt_table_info tmp; if (compat) { ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(AF_INET6); private = &tmp; } #endif memset(&info, 0, sizeof(info)); info.valid_hooks = t->valid_hooks; memcpy(info.hook_entry, private->hook_entry, sizeof(info.hook_entry)); 
memcpy(info.underflow, private->underflow, sizeof(info.underflow)); info.num_entries = private->number; info.size = private->size; strcpy(info.name, name); if (copy_to_user(user, &info, *len) != 0) ret = -EFAULT; else ret = 0; xt_table_unlock(t); module_put(t->me); } else ret = PTR_ERR(t); #ifdef CONFIG_COMPAT if (compat) xt_compat_unlock(AF_INET6); #endif return ret; } static int get_entries(struct net *net, struct ip6t_get_entries __user *uptr, const int *len) { int ret; struct ip6t_get_entries get; struct xt_table *t; if (*len < sizeof(get)) return -EINVAL; if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; if (*len != sizeof(struct ip6t_get_entries) + get.size) return -EINVAL; get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { struct xt_table_info *private = t->private; if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); else ret = -EAGAIN; module_put(t->me); xt_table_unlock(t); } else ret = PTR_ERR(t); return ret; } static int __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table_info *newinfo, unsigned int num_counters, void __user *counters_ptr) { int ret; struct xt_table *t; struct xt_table_info *oldinfo; struct xt_counters *counters; struct ip6t_entry *iter; ret = 0; counters = xt_counters_alloc(num_counters); if (!counters) { ret = -ENOMEM; goto out; } t = xt_request_find_table_lock(net, AF_INET6, name); if (IS_ERR(t)) { ret = PTR_ERR(t); goto free_newinfo_counters_untrans; } /* You lied! */ if (valid_hooks != t->valid_hooks) { ret = -EINVAL; goto put_module; } oldinfo = xt_replace_table(t, num_counters, newinfo, &ret); if (!oldinfo) goto put_module; /* Update module usage count based on number of rules */ if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); if ((oldinfo->number > oldinfo->initial_entries) && (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); xt_table_unlock(t); get_old_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ xt_entry_foreach(iter, oldinfo->entries, oldinfo->size) cleanup_entry(iter, net); xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, sizeof(struct xt_counters) * num_counters) != 0) { /* Silent error, can't fail, new table is already in place */ net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n"); } vfree(counters); return ret; put_module: module_put(t->me); xt_table_unlock(t); free_newinfo_counters_untrans: vfree(counters); out: return ret; } static int do_replace(struct net *net, const void __user *user, unsigned int len) { int ret; struct ip6t_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; struct ip6t_entry *iter; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: 
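/* The new table could not be swapped in: undo the successful translation by
 * releasing the match/target module references taken by translate_table().
 */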
xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } static int do_add_counters(struct net *net, const void __user *user, unsigned int len, int compat) { unsigned int i; struct xt_counters_info tmp; struct xt_counters *paddc; struct xt_table *t; const struct xt_table_info *private; int ret = 0; struct ip6t_entry *iter; unsigned int addend; paddc = xt_copy_counters_from_user(user, len, &tmp, compat); if (IS_ERR(paddc)) return PTR_ERR(paddc); t = xt_find_table_lock(net, AF_INET6, tmp.name); if (IS_ERR(t)) { ret = PTR_ERR(t); goto free; } local_bh_disable(); private = t->private; if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; } i = 0; addend = xt_write_recseq_begin(); xt_entry_foreach(iter, private->entries, private->size) { struct xt_counters *tmp; tmp = xt_get_this_cpu_counter(&iter->counters); ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt); ++i; } xt_write_recseq_end(addend); unlock_up_free: local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: vfree(paddc); return ret; } #ifdef CONFIG_COMPAT struct compat_ip6t_replace { char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; u32 num_entries; u32 size; u32 hook_entry[NF_INET_NUMHOOKS]; u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; compat_uptr_t counters; /* struct xt_counters * */ struct compat_ip6t_entry entries[0]; }; static int compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr, unsigned int *size, struct xt_counters *counters, unsigned int i) { struct xt_entry_target *t; struct compat_ip6t_entry __user *ce; u_int16_t target_offset, next_offset; compat_uint_t origsize; const struct xt_entry_match *ematch; int ret = 0; origsize = *size; ce = *dstptr; if (copy_to_user(ce, e, sizeof(struct ip6t_entry)) != 0 || copy_to_user(&ce->counters, &counters[i], sizeof(counters[i])) != 0) return -EFAULT; *dstptr += sizeof(struct compat_ip6t_entry); *size -= sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); xt_ematch_foreach(ematch, e) { ret = xt_compat_match_to_user(ematch, dstptr, size); if (ret != 0) return ret; } target_offset = e->target_offset - (origsize - *size); t = ip6t_get_target(e); ret = xt_compat_target_to_user(t, dstptr, size); if (ret) return ret; next_offset = e->next_offset - (origsize - *size); if (put_user(target_offset, &ce->target_offset) != 0 || put_user(next_offset, &ce->next_offset) != 0) return -EFAULT; return 0; } static int compat_find_calc_match(struct xt_entry_match *m, const struct ip6t_ip6 *ipv6, int *size) { struct xt_match *match; match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, m->u.user.revision); if (IS_ERR(match)) return PTR_ERR(match); m->u.kernel.match = match; *size += xt_compat_match_offset(match); return 0; } static void compat_release_entry(struct compat_ip6t_entry *e) { struct xt_entry_target *t; struct xt_entry_match *ematch; /* Cleanup all matches */ xt_ematch_foreach(ematch, e) module_put(ematch->u.kernel.match->me); t = compat_ip6t_get_target(e); module_put(t->u.kernel.target->me); } static int check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, struct xt_table_info *newinfo, unsigned int *size, const unsigned char *base, const unsigned char *limit) { struct xt_entry_match *ematch; struct xt_entry_target *t; struct xt_target *target; unsigned int entry_offset; unsigned int j; int ret, off; if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit || 
(unsigned char *)e + e->next_offset > limit) return -EINVAL; if (e->next_offset < sizeof(struct compat_ip6t_entry) + sizeof(struct compat_xt_entry_target)) return -EINVAL; if (!ip6_checkentry(&e->ipv6)) return -EINVAL; ret = xt_compat_check_entry_offsets(e, e->elems, e->target_offset, e->next_offset); if (ret) return ret; off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); entry_offset = (void *)e - (void *)base; j = 0; xt_ematch_foreach(ematch, e) { ret = compat_find_calc_match(ematch, &e->ipv6, &off); if (ret != 0) goto release_matches; ++j; } t = compat_ip6t_get_target(e); target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { ret = PTR_ERR(target); goto release_matches; } t->u.kernel.target = target; off += xt_compat_target_offset(target); *size += off; ret = xt_compat_add_offset(AF_INET6, entry_offset, off); if (ret) goto out; return 0; out: module_put(t->u.kernel.target->me); release_matches: xt_ematch_foreach(ematch, e) { if (j-- == 0) break; module_put(ematch->u.kernel.match->me); } return ret; } static void compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr, unsigned int *size, struct xt_table_info *newinfo, unsigned char *base) { struct xt_entry_target *t; struct ip6t_entry *de; unsigned int origsize; int h; struct xt_entry_match *ematch; origsize = *size; de = *dstptr; memcpy(de, e, sizeof(struct ip6t_entry)); memcpy(&de->counters, &e->counters, sizeof(e->counters)); *dstptr += sizeof(struct ip6t_entry); *size += sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry); xt_ematch_foreach(ematch, e) xt_compat_match_from_user(ematch, dstptr, size); de->target_offset = e->target_offset - (origsize - *size); t = compat_ip6t_get_target(e); xt_compat_target_from_user(t, dstptr, size); de->next_offset = e->next_offset - (origsize - *size); for (h = 0; h < NF_INET_NUMHOOKS; h++) { if ((unsigned char *)de - base < newinfo->hook_entry[h]) newinfo->hook_entry[h] -= origsize - *size; if ((unsigned char *)de - base < newinfo->underflow[h]) newinfo->underflow[h] -= origsize - *size; } } static int translate_compat_table(struct net *net, struct xt_table_info **pinfo, void **pentry0, const struct compat_ip6t_replace *compatr) { unsigned int i, j; struct xt_table_info *newinfo, *info; void *pos, *entry0, *entry1; struct compat_ip6t_entry *iter0; struct ip6t_replace repl; unsigned int size; int ret; info = *pinfo; entry0 = *pentry0; size = compatr->size; info->number = compatr->num_entries; j = 0; xt_compat_lock(AF_INET6); ret = xt_compat_init_offsets(AF_INET6, compatr->num_entries); if (ret) goto out_unlock; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, compatr->size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, entry0, entry0 + compatr->size); if (ret != 0) goto out_unlock; ++j; } ret = -EINVAL; if (j != compatr->num_entries) goto out_unlock; ret = -ENOMEM; newinfo = xt_alloc_table_info(size); if (!newinfo) goto out_unlock; memset(newinfo->entries, 0, size); newinfo->number = compatr->num_entries; for (i = 0; i < NF_INET_NUMHOOKS; i++) { newinfo->hook_entry[i] = compatr->hook_entry[i]; newinfo->underflow[i] = compatr->underflow[i]; } entry1 = newinfo->entries; pos = entry1; size = compatr->size; xt_entry_foreach(iter0, entry0, compatr->size) compat_copy_entry_from_user(iter0, &pos, &size, newinfo, entry1); /* all module references in entry0 are now gone. 
*/ xt_compat_flush_offsets(AF_INET6); xt_compat_unlock(AF_INET6); memcpy(&repl, compatr, sizeof(*compatr)); for (i = 0; i < NF_INET_NUMHOOKS; i++) { repl.hook_entry[i] = newinfo->hook_entry[i]; repl.underflow[i] = newinfo->underflow[i]; } repl.num_counters = 0; repl.counters = NULL; repl.size = newinfo->size; ret = translate_table(net, newinfo, entry1, &repl); if (ret) goto free_newinfo; *pinfo = newinfo; *pentry0 = entry1; xt_free_table_info(info); return 0; free_newinfo: xt_free_table_info(newinfo); return ret; out_unlock: xt_compat_flush_offsets(AF_INET6); xt_compat_unlock(AF_INET6); xt_entry_foreach(iter0, entry0, compatr->size) { if (j-- == 0) break; compat_release_entry(iter0); } return ret; } static int compat_do_replace(struct net *net, void __user *user, unsigned int len) { int ret; struct compat_ip6t_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; struct ip6t_entry *iter; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) return -EINVAL; tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; } ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) goto free_newinfo_untrans; return 0; free_newinfo_untrans: xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } static int compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { int ret; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case IP6T_SO_SET_REPLACE: ret = compat_do_replace(sock_net(sk), user, len); break; case IP6T_SO_SET_ADD_COUNTERS: ret = do_add_counters(sock_net(sk), user, len, 1); break; default: ret = -EINVAL; } return ret; } struct compat_ip6t_get_entries { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; struct compat_ip6t_entry entrytable[0]; }; static int compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; const struct xt_table_info *private = table->private; void __user *pos; unsigned int size; int ret = 0; unsigned int i = 0; struct ip6t_entry *iter; counters = alloc_counters(table); if (IS_ERR(counters)) return PTR_ERR(counters); pos = userptr; size = total_size; xt_entry_foreach(iter, private->entries, total_size) { ret = compat_copy_entry_to_user(iter, &pos, &size, counters, i++); if (ret != 0) break; } vfree(counters); return ret; } static int compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, int *len) { int ret; struct compat_ip6t_get_entries get; struct xt_table *t; if (*len < sizeof(get)) return -EINVAL; if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; if (*len != sizeof(struct compat_ip6t_get_entries) + get.size) return -EINVAL; get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) ret = 
compat_copy_entries_to_user(private->size, t, uptr->entrytable); else if (!ret) ret = -EAGAIN; xt_compat_flush_offsets(AF_INET6); module_put(t->me); xt_table_unlock(t); } else ret = PTR_ERR(t); xt_compat_unlock(AF_INET6); return ret; } static int do_ip6t_get_ctl(struct sock *, int, void __user *, int *); static int compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case IP6T_SO_GET_INFO: ret = get_info(sock_net(sk), user, len, 1); break; case IP6T_SO_GET_ENTRIES: ret = compat_get_entries(sock_net(sk), user, len); break; default: ret = do_ip6t_get_ctl(sk, cmd, user, len); } return ret; } #endif static int do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { int ret; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case IP6T_SO_SET_REPLACE: ret = do_replace(sock_net(sk), user, len); break; case IP6T_SO_SET_ADD_COUNTERS: ret = do_add_counters(sock_net(sk), user, len, 0); break; default: ret = -EINVAL; } return ret; } static int do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { case IP6T_SO_GET_INFO: ret = get_info(sock_net(sk), user, len, 0); break; case IP6T_SO_GET_ENTRIES: ret = get_entries(sock_net(sk), user, len); break; case IP6T_SO_GET_REVISION_MATCH: case IP6T_SO_GET_REVISION_TARGET: { struct xt_get_revision rev; int target; if (*len != sizeof(rev)) { ret = -EINVAL; break; } if (copy_from_user(&rev, user, sizeof(rev)) != 0) { ret = -EFAULT; break; } rev.name[sizeof(rev.name)-1] = 0; if (cmd == IP6T_SO_GET_REVISION_TARGET) target = 1; else target = 0; try_then_request_module(xt_find_revision(AF_INET6, rev.name, rev.revision, target, &ret), "ip6t_%s", rev.name); break; } default: ret = -EINVAL; } return ret; } static void __ip6t_unregister_table(struct net *net, struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; struct module *table_owner = table->me; struct ip6t_entry *iter; private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries; xt_entry_foreach(iter, loc_cpu_entry, private->size) cleanup_entry(iter, net); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); } int ip6t_register_table(struct net *net, const struct xt_table *table, const struct ip6t_replace *repl, const struct nf_hook_ops *ops, struct xt_table **res) { int ret; struct xt_table_info *newinfo; struct xt_table_info bootstrap = {0}; void *loc_cpu_entry; struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(net, newinfo, loc_cpu_entry, repl); if (ret != 0) goto out_free; new_table = xt_register_table(net, table, &bootstrap, newinfo); if (IS_ERR(new_table)) { ret = PTR_ERR(new_table); goto out_free; } /* set res now, will see skbs right after nf_register_net_hooks */ WRITE_ONCE(*res, new_table); if (!ops) return 0; ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); if (ret != 0) { __ip6t_unregister_table(net, new_table); *res = NULL; } return ret; out_free: xt_free_table_info(newinfo); return ret; } void ip6t_unregister_table(struct net *net, struct xt_table *table, const struct nf_hook_ops *ops) { if (ops) 
nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); __ip6t_unregister_table(net, table); } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ static inline bool icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, u_int8_t type, u_int8_t code, bool invert) { return (type == test_type && code >= min_code && code <= max_code) ^ invert; } static bool icmp6_match(const struct sk_buff *skb, struct xt_action_param *par) { const struct icmp6hdr *ic; struct icmp6hdr _icmph; const struct ip6t_icmp *icmpinfo = par->matchinfo; /* Must not be a fragment. */ if (par->fragoff != 0) return false; ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); if (ic == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ par->hotdrop = true; return false; } return icmp6_type_code_match(icmpinfo->type, icmpinfo->code[0], icmpinfo->code[1], ic->icmp6_type, ic->icmp6_code, !!(icmpinfo->invflags&IP6T_ICMP_INV)); } /* Called when user tries to insert an entry of this type. */ static int icmp6_checkentry(const struct xt_mtchk_param *par) { const struct ip6t_icmp *icmpinfo = par->matchinfo; /* Must specify no unknown invflags */ return (icmpinfo->invflags & ~IP6T_ICMP_INV) ? -EINVAL : 0; } /* The built-in targets: standard (NULL) and error. */ static struct xt_target ip6t_builtin_tg[] __read_mostly = { { .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_IPV6, #ifdef CONFIG_COMPAT .compatsize = sizeof(compat_int_t), .compat_from_user = compat_standard_from_user, .compat_to_user = compat_standard_to_user, #endif }, { .name = XT_ERROR_TARGET, .target = ip6t_error, .targetsize = XT_FUNCTION_MAXNAMELEN, .family = NFPROTO_IPV6, }, }; static struct nf_sockopt_ops ip6t_sockopts = { .pf = PF_INET6, .set_optmin = IP6T_BASE_CTL, .set_optmax = IP6T_SO_SET_MAX+1, .set = do_ip6t_set_ctl, #ifdef CONFIG_COMPAT .compat_set = compat_do_ip6t_set_ctl, #endif .get_optmin = IP6T_BASE_CTL, .get_optmax = IP6T_SO_GET_MAX+1, .get = do_ip6t_get_ctl, #ifdef CONFIG_COMPAT .compat_get = compat_do_ip6t_get_ctl, #endif .owner = THIS_MODULE, }; static struct xt_match ip6t_builtin_mt[] __read_mostly = { { .name = "icmp6", .match = icmp6_match, .matchsize = sizeof(struct ip6t_icmp), .checkentry = icmp6_checkentry, .proto = IPPROTO_ICMPV6, .family = NFPROTO_IPV6, .me = THIS_MODULE, }, }; static int __net_init ip6_tables_net_init(struct net *net) { return xt_proto_init(net, NFPROTO_IPV6); } static void __net_exit ip6_tables_net_exit(struct net *net) { xt_proto_fini(net, NFPROTO_IPV6); } static struct pernet_operations ip6_tables_net_ops = { .init = ip6_tables_net_init, .exit = ip6_tables_net_exit, }; static int __init ip6_tables_init(void) { int ret; ret = register_pernet_subsys(&ip6_tables_net_ops); if (ret < 0) goto err1; /* No one else will be downing sem now, so we won't sleep */ ret = xt_register_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); if (ret < 0) goto err2; ret = xt_register_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt)); if (ret < 0) goto err4; /* Register setsockopt */ ret = nf_register_sockopt(&ip6t_sockopts); if (ret < 0) goto err5; return 0; err5: xt_unregister_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt)); err4: xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); err2: unregister_pernet_subsys(&ip6_tables_net_ops); err1: return ret; } static void __exit ip6_tables_fini(void) { nf_unregister_sockopt(&ip6t_sockopts); xt_unregister_matches(ip6t_builtin_mt, 
ARRAY_SIZE(ip6t_builtin_mt)); xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg)); unregister_pernet_subsys(&ip6_tables_net_ops); } EXPORT_SYMBOL(ip6t_register_table); EXPORT_SYMBOL(ip6t_unregister_table); EXPORT_SYMBOL(ip6t_do_table); module_init(ip6_tables_init); module_exit(ip6_tables_fini); |
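/*
 * Illustration only (not part of ip6_tables): get_info() and get_entries()
 * above expose a two-step getsockopt protocol. IP6T_SO_GET_INFO reports the
 * size of the rule blob, and IP6T_SO_GET_ENTRIES must then be called with a
 * length of exactly sizeof(struct ip6t_get_entries) plus that size. A minimal
 * user-space sketch, assuming the uapi definitions from
 * <linux/netfilter_ipv6/ip6_tables.h> and CAP_NET_ADMIN, might look like this:
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netfilter_ipv6/ip6_tables.h>

int main(void)
{
	struct ip6t_getinfo info;
	struct ip6t_get_entries *entries;
	socklen_t len = sizeof(info);
	/* libip6tc conventionally uses a raw IPv6 socket for these calls */
	int fd = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);

	if (fd < 0)
		return 1;

	memset(&info, 0, sizeof(info));
	strcpy(info.name, "filter");

	/* Step 1: learn hook offsets, entry count and rule blob size. */
	if (getsockopt(fd, SOL_IPV6, IP6T_SO_GET_INFO, &info, &len) < 0)
		return 1;

	/* Step 2: fetch the entries; the length must match the kernel's check. */
	len = sizeof(*entries) + info.size;
	entries = calloc(1, len);
	if (!entries)
		return 1;
	strcpy(entries->name, "filter");
	entries->size = info.size;
	if (getsockopt(fd, SOL_IPV6, IP6T_SO_GET_ENTRIES, entries, &len) < 0)
		return 1;

	printf("%s: %u entries, %u bytes of rules\n",
	       info.name, info.num_entries, info.size);
	free(entries);
	return 0;
}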
/*
 * gw.c - CAN frame Gateway/Router/Bridge with netlink interface
 *
 * Copyright (c) 2017 Volkswagen Group Electronic Research
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of Volkswagen nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * Alternatively, provided that this notice is retained in full, this
 * software may be distributed under the terms of the GNU General
 * Public License ("GPL") version 2, in which case the provisions of the
 * GPL apply INSTEAD OF those given above.
 *
 * The provided data structures and external interfaces from this code
 * are not restricted to be used by modules with a GPL compatible license.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
* */ #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/gw.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> #define CAN_GW_VERSION "20170425" #define CAN_GW_NAME "can-gw" MODULE_DESCRIPTION("PF_CAN netlink gateway"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); MODULE_ALIAS(CAN_GW_NAME); #define CGW_MIN_HOPS 1 #define CGW_MAX_HOPS 6 #define CGW_DEFAULT_HOPS 1 static unsigned int max_hops __read_mostly = CGW_DEFAULT_HOPS; module_param(max_hops, uint, 0444); MODULE_PARM_DESC(max_hops, "maximum " CAN_GW_NAME " routing hops for CAN frames " "(valid values: " __stringify(CGW_MIN_HOPS) "-" __stringify(CGW_MAX_HOPS) " hops, " "default: " __stringify(CGW_DEFAULT_HOPS) ")"); static struct notifier_block notifier; static struct kmem_cache *cgw_cache __read_mostly; /* structure that contains the (on-the-fly) CAN frame modifications */ struct cf_mod { struct { struct can_frame and; struct can_frame or; struct can_frame xor; struct can_frame set; } modframe; struct { u8 and; u8 or; u8 xor; u8 set; } modtype; void (*modfunc[MAX_MODFUNCTIONS])(struct can_frame *cf, struct cf_mod *mod); /* CAN frame checksum calculation after CAN frame modifications */ struct { struct cgw_csum_xor xor; struct cgw_csum_crc8 crc8; } csum; struct { void (*xor)(struct can_frame *cf, struct cgw_csum_xor *xor); void (*crc8)(struct can_frame *cf, struct cgw_csum_crc8 *crc8); } csumfunc; u32 uid; }; /* * So far we just support CAN -> CAN routing and frame modifications. * * The internal can_can_gw structure contains data and attributes for * a CAN -> CAN gateway job. 
*/ struct can_can_gw { struct can_filter filter; int src_idx; int dst_idx; }; /* list entry for CAN gateways jobs */ struct cgw_job { struct hlist_node list; struct rcu_head rcu; u32 handled_frames; u32 dropped_frames; u32 deleted_frames; struct cf_mod mod; union { /* CAN frame data source */ struct net_device *dev; } src; union { /* CAN frame data destination */ struct net_device *dev; } dst; union { struct can_can_gw ccgw; /* tbc */ }; u8 gwtype; u8 limit_hops; u16 flags; }; /* modification functions that are invoked in the hot path in can_can_gw_rcv */ #define MODFUNC(func, op) static void func(struct can_frame *cf, \ struct cf_mod *mod) { op ; } MODFUNC(mod_and_id, cf->can_id &= mod->modframe.and.can_id) MODFUNC(mod_and_dlc, cf->can_dlc &= mod->modframe.and.can_dlc) MODFUNC(mod_and_data, *(u64 *)cf->data &= *(u64 *)mod->modframe.and.data) MODFUNC(mod_or_id, cf->can_id |= mod->modframe.or.can_id) MODFUNC(mod_or_dlc, cf->can_dlc |= mod->modframe.or.can_dlc) MODFUNC(mod_or_data, *(u64 *)cf->data |= *(u64 *)mod->modframe.or.data) MODFUNC(mod_xor_id, cf->can_id ^= mod->modframe.xor.can_id) MODFUNC(mod_xor_dlc, cf->can_dlc ^= mod->modframe.xor.can_dlc) MODFUNC(mod_xor_data, *(u64 *)cf->data ^= *(u64 *)mod->modframe.xor.data) MODFUNC(mod_set_id, cf->can_id = mod->modframe.set.can_id) MODFUNC(mod_set_dlc, cf->can_dlc = mod->modframe.set.can_dlc) MODFUNC(mod_set_data, *(u64 *)cf->data = *(u64 *)mod->modframe.set.data) static inline void canframecpy(struct can_frame *dst, struct can_frame *src) { /* * Copy the struct members separately to ensure that no uninitialized * data are copied in the 3 bytes hole of the struct. This is needed * to make easy compares of the data in the struct cf_mod. */ dst->can_id = src->can_id; dst->can_dlc = src->can_dlc; *(u64 *)dst->data = *(u64 *)src->data; } static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re) { /* * absolute dlc values 0 .. 7 => 0 .. 7, e.g. data [0] * relative to received dlc -1 .. -8 : * e.g. 
for received dlc = 8 * -1 => index = 7 (data[7]) * -3 => index = 5 (data[5]) * -8 => index = 0 (data[0]) */ if (fr > -9 && fr < 8 && to > -9 && to < 8 && re > -9 && re < 8) return 0; else return -EINVAL; } static inline int calc_idx(int idx, int rx_dlc) { if (idx < 0) return rx_dlc + idx; else return idx; } static void cgw_csum_xor_rel(struct can_frame *cf, struct cgw_csum_xor *xor) { int from = calc_idx(xor->from_idx, cf->can_dlc); int to = calc_idx(xor->to_idx, cf->can_dlc); int res = calc_idx(xor->result_idx, cf->can_dlc); u8 val = xor->init_xor_val; int i; if (from < 0 || to < 0 || res < 0) return; if (from <= to) { for (i = from; i <= to; i++) val ^= cf->data[i]; } else { for (i = from; i >= to; i--) val ^= cf->data[i]; } cf->data[res] = val; } static void cgw_csum_xor_pos(struct can_frame *cf, struct cgw_csum_xor *xor) { u8 val = xor->init_xor_val; int i; for (i = xor->from_idx; i <= xor->to_idx; i++) val ^= cf->data[i]; cf->data[xor->result_idx] = val; } static void cgw_csum_xor_neg(struct can_frame *cf, struct cgw_csum_xor *xor) { u8 val = xor->init_xor_val; int i; for (i = xor->from_idx; i >= xor->to_idx; i--) val ^= cf->data[i]; cf->data[xor->result_idx] = val; } static void cgw_csum_crc8_rel(struct can_frame *cf, struct cgw_csum_crc8 *crc8) { int from = calc_idx(crc8->from_idx, cf->can_dlc); int to = calc_idx(crc8->to_idx, cf->can_dlc); int res = calc_idx(crc8->result_idx, cf->can_dlc); u8 crc = crc8->init_crc_val; int i; if (from < 0 || to < 0 || res < 0) return; if (from <= to) { for (i = crc8->from_idx; i <= crc8->to_idx; i++) crc = crc8->crctab[crc^cf->data[i]]; } else { for (i = crc8->from_idx; i >= crc8->to_idx; i--) crc = crc8->crctab[crc^cf->data[i]]; } switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc^crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc^crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc^(cf->can_id & 0xFF)^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc^crc8->final_xor_val; } static void cgw_csum_crc8_pos(struct can_frame *cf, struct cgw_csum_crc8 *crc8) { u8 crc = crc8->init_crc_val; int i; for (i = crc8->from_idx; i <= crc8->to_idx; i++) crc = crc8->crctab[crc^cf->data[i]]; switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc^crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc^crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc^(cf->can_id & 0xFF)^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc^crc8->final_xor_val; } static void cgw_csum_crc8_neg(struct can_frame *cf, struct cgw_csum_crc8 *crc8) { u8 crc = crc8->init_crc_val; int i; for (i = crc8->from_idx; i >= crc8->to_idx; i--) crc = crc8->crctab[crc^cf->data[i]]; switch (crc8->profile) { case CGW_CRC8PRF_1U8: crc = crc8->crctab[crc^crc8->profile_data[0]]; break; case CGW_CRC8PRF_16U8: crc = crc8->crctab[crc^crc8->profile_data[cf->data[1] & 0xF]]; break; case CGW_CRC8PRF_SFFID_XOR: crc = crc8->crctab[crc^(cf->can_id & 0xFF)^ (cf->can_id >> 8 & 0xFF)]; break; } cf->data[crc8->result_idx] = crc^crc8->final_xor_val; } /* the receive & process & send function */ static void can_can_gw_rcv(struct sk_buff *skb, void *data) { struct cgw_job *gwj = (struct cgw_job *)data; struct can_frame *cf; struct sk_buff *nskb; int modidx = 0; /* * Do not handle CAN frames routed more than 'max_hops' times. 
* In general we should never catch this delimiter which is intended * to cover a misconfiguration protection (e.g. circular CAN routes). * * The Controller Area Network controllers only accept CAN frames with * correct CRCs - which are not visible in the controller registers. * According to skbuff.h documentation the csum_start element for IP * checksums is undefined/unused when ip_summed == CHECKSUM_UNNECESSARY. * Only CAN skbs can be processed here which already have this property. */ #define cgw_hops(skb) ((skb)->csum_start) BUG_ON(skb->ip_summed != CHECKSUM_UNNECESSARY); if (cgw_hops(skb) >= max_hops) { /* indicate deleted frames due to misconfiguration */ gwj->deleted_frames++; return; } if (!(gwj->dst.dev->flags & IFF_UP)) { gwj->dropped_frames++; return; } /* is sending the skb back to the incoming interface not allowed? */ if (!(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK) && can_skb_prv(skb)->ifindex == gwj->dst.dev->ifindex) return; /* * clone the given skb, which has not been done in can_rcv() * * When there is at least one modification function activated, * we need to copy the skb as we want to modify skb->data. */ if (gwj->mod.modfunc[0]) nskb = skb_copy(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) { gwj->dropped_frames++; return; } /* put the incremented hop counter in the cloned skb */ cgw_hops(nskb) = cgw_hops(skb) + 1; /* first processing of this CAN frame -> adjust to private hop limit */ if (gwj->limit_hops && cgw_hops(nskb) == 1) cgw_hops(nskb) = max_hops - gwj->limit_hops + 1; nskb->dev = gwj->dst.dev; /* pointer to modifiable CAN frame */ cf = (struct can_frame *)nskb->data; /* perform preprocessed modification functions if there are any */ while (modidx < MAX_MODFUNCTIONS && gwj->mod.modfunc[modidx]) (*gwj->mod.modfunc[modidx++])(cf, &gwj->mod); /* Has the CAN frame been modified? 
*/ if (modidx) { /* get available space for the processed CAN frame type */ int max_len = nskb->len - offsetof(struct can_frame, data); /* dlc may have changed, make sure it fits to the CAN frame */ if (cf->can_dlc > max_len) goto out_delete; /* check for checksum updates in classic CAN length only */ if (gwj->mod.csumfunc.crc8) { if (cf->can_dlc > 8) goto out_delete; (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8); } if (gwj->mod.csumfunc.xor) { if (cf->can_dlc > 8) goto out_delete; (*gwj->mod.csumfunc.xor)(cf, &gwj->mod.csum.xor); } } /* clear the skb timestamp if not configured the other way */ if (!(gwj->flags & CGW_FLAGS_CAN_SRC_TSTAMP)) nskb->tstamp = 0; /* send to netdevice */ if (can_send(nskb, gwj->flags & CGW_FLAGS_CAN_ECHO)) gwj->dropped_frames++; else gwj->handled_frames++; return; out_delete: /* delete frame due to misconfiguration */ gwj->deleted_frames++; kfree_skb(nskb); return; } static inline int cgw_register_filter(struct net *net, struct cgw_job *gwj) { return can_rx_register(net, gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj, "gw", NULL); } static inline void cgw_unregister_filter(struct net *net, struct cgw_job *gwj) { can_rx_unregister(net, gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj); } static int cgw_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg == NETDEV_UNREGISTER) { struct cgw_job *gwj = NULL; struct hlist_node *nx; ASSERT_RTNL(); hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { if (gwj->src.dev == dev || gwj->dst.dev == dev) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); synchronize_rcu(); kmem_cache_free(cgw_cache, gwj); } } } return NOTIFY_DONE; } static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, u32 pid, u32 seq, int flags) { struct cgw_frame_mod mb; struct rtcanmsg *rtcan; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtcan), flags); if (!nlh) return -EMSGSIZE; rtcan = nlmsg_data(nlh); rtcan->can_family = AF_CAN; rtcan->gwtype = gwj->gwtype; rtcan->flags = gwj->flags; /* add statistics if available */ if (gwj->handled_frames) { if (nla_put_u32(skb, CGW_HANDLED, gwj->handled_frames) < 0) goto cancel; } if (gwj->dropped_frames) { if (nla_put_u32(skb, CGW_DROPPED, gwj->dropped_frames) < 0) goto cancel; } if (gwj->deleted_frames) { if (nla_put_u32(skb, CGW_DELETED, gwj->deleted_frames) < 0) goto cancel; } /* check non default settings of attributes */ if (gwj->limit_hops) { if (nla_put_u8(skb, CGW_LIM_HOPS, gwj->limit_hops) < 0) goto cancel; } if (gwj->mod.modtype.and) { memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.and; if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.or) { memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.or; if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.xor) { memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.xor; if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.modtype.set) { memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); mb.modtype = gwj->mod.modtype.set; if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0) goto cancel; } if (gwj->mod.uid) { if (nla_put_u32(skb, CGW_MOD_UID, gwj->mod.uid) < 0) goto 
cancel; } if (gwj->mod.csumfunc.crc8) { if (nla_put(skb, CGW_CS_CRC8, CGW_CS_CRC8_LEN, &gwj->mod.csum.crc8) < 0) goto cancel; } if (gwj->mod.csumfunc.xor) { if (nla_put(skb, CGW_CS_XOR, CGW_CS_XOR_LEN, &gwj->mod.csum.xor) < 0) goto cancel; } if (gwj->gwtype == CGW_TYPE_CAN_CAN) { if (gwj->ccgw.filter.can_id || gwj->ccgw.filter.can_mask) { if (nla_put(skb, CGW_FILTER, sizeof(struct can_filter), &gwj->ccgw.filter) < 0) goto cancel; } if (nla_put_u32(skb, CGW_SRC_IF, gwj->ccgw.src_idx) < 0) goto cancel; if (nla_put_u32(skb, CGW_DST_IF, gwj->ccgw.dst_idx) < 0) goto cancel; } nlmsg_end(skb, nlh); return 0; cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } /* Dump information about all CAN gateway jobs, in response to RTM_GETROUTE */ static int cgw_dump_jobs(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct cgw_job *gwj = NULL; int idx = 0; int s_idx = cb->args[0]; rcu_read_lock(); hlist_for_each_entry_rcu(gwj, &net->can.cgw_list, list) { if (idx < s_idx) goto cont; if (cgw_put_job(skb, gwj, RTM_NEWROUTE, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0) break; cont: idx++; } rcu_read_unlock(); cb->args[0] = idx; return skb->len; } static const struct nla_policy cgw_policy[CGW_MAX+1] = { [CGW_MOD_AND] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_OR] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_XOR] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_MOD_SET] = { .len = sizeof(struct cgw_frame_mod) }, [CGW_CS_XOR] = { .len = sizeof(struct cgw_csum_xor) }, [CGW_CS_CRC8] = { .len = sizeof(struct cgw_csum_crc8) }, [CGW_SRC_IF] = { .type = NLA_U32 }, [CGW_DST_IF] = { .type = NLA_U32 }, [CGW_FILTER] = { .len = sizeof(struct can_filter) }, [CGW_LIM_HOPS] = { .type = NLA_U8 }, [CGW_MOD_UID] = { .type = NLA_U32 }, }; /* check for common and gwtype specific attributes */ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, u8 gwtype, void *gwtypeattr, u8 *limhops) { struct nlattr *tb[CGW_MAX+1]; struct cgw_frame_mod mb; int modidx = 0; int err = 0; /* initialize modification & checksum data space */ memset(mod, 0, sizeof(*mod)); err = nlmsg_parse(nlh, sizeof(struct rtcanmsg), tb, CGW_MAX, cgw_policy, NULL); if (err < 0) return err; if (tb[CGW_LIM_HOPS]) { *limhops = nla_get_u8(tb[CGW_LIM_HOPS]); if (*limhops < 1 || *limhops > max_hops) return -EINVAL; } /* check for AND/OR/XOR/SET modifications */ if (tb[CGW_MOD_AND]) { nla_memcpy(&mb, tb[CGW_MOD_AND], CGW_MODATTR_LEN); canframecpy(&mod->modframe.and, &mb.cf); mod->modtype.and = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_and_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_and_dlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_and_data; } if (tb[CGW_MOD_OR]) { nla_memcpy(&mb, tb[CGW_MOD_OR], CGW_MODATTR_LEN); canframecpy(&mod->modframe.or, &mb.cf); mod->modtype.or = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_or_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_or_dlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_or_data; } if (tb[CGW_MOD_XOR]) { nla_memcpy(&mb, tb[CGW_MOD_XOR], CGW_MODATTR_LEN); canframecpy(&mod->modframe.xor, &mb.cf); mod->modtype.xor = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_xor_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_xor_dlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_xor_data; } if (tb[CGW_MOD_SET]) { nla_memcpy(&mb, tb[CGW_MOD_SET], CGW_MODATTR_LEN); 
canframecpy(&mod->modframe.set, &mb.cf); mod->modtype.set = mb.modtype; if (mb.modtype & CGW_MOD_ID) mod->modfunc[modidx++] = mod_set_id; if (mb.modtype & CGW_MOD_DLC) mod->modfunc[modidx++] = mod_set_dlc; if (mb.modtype & CGW_MOD_DATA) mod->modfunc[modidx++] = mod_set_data; } /* check for checksum operations after CAN frame modifications */ if (modidx) { if (tb[CGW_CS_CRC8]) { struct cgw_csum_crc8 *c = nla_data(tb[CGW_CS_CRC8]); err = cgw_chk_csum_parms(c->from_idx, c->to_idx, c->result_idx); if (err) return err; nla_memcpy(&mod->csum.crc8, tb[CGW_CS_CRC8], CGW_CS_CRC8_LEN); /* * select dedicated processing function to reduce * runtime operations in receive hot path. */ if (c->from_idx < 0 || c->to_idx < 0 || c->result_idx < 0) mod->csumfunc.crc8 = cgw_csum_crc8_rel; else if (c->from_idx <= c->to_idx) mod->csumfunc.crc8 = cgw_csum_crc8_pos; else mod->csumfunc.crc8 = cgw_csum_crc8_neg; } if (tb[CGW_CS_XOR]) { struct cgw_csum_xor *c = nla_data(tb[CGW_CS_XOR]); err = cgw_chk_csum_parms(c->from_idx, c->to_idx, c->result_idx); if (err) return err; nla_memcpy(&mod->csum.xor, tb[CGW_CS_XOR], CGW_CS_XOR_LEN); /* * select dedicated processing function to reduce * runtime operations in receive hot path. */ if (c->from_idx < 0 || c->to_idx < 0 || c->result_idx < 0) mod->csumfunc.xor = cgw_csum_xor_rel; else if (c->from_idx <= c->to_idx) mod->csumfunc.xor = cgw_csum_xor_pos; else mod->csumfunc.xor = cgw_csum_xor_neg; } if (tb[CGW_MOD_UID]) { nla_memcpy(&mod->uid, tb[CGW_MOD_UID], sizeof(u32)); } } if (gwtype == CGW_TYPE_CAN_CAN) { /* check CGW_TYPE_CAN_CAN specific attributes */ struct can_can_gw *ccgw = (struct can_can_gw *)gwtypeattr; memset(ccgw, 0, sizeof(*ccgw)); /* check for can_filter in attributes */ if (tb[CGW_FILTER]) nla_memcpy(&ccgw->filter, tb[CGW_FILTER], sizeof(struct can_filter)); err = -ENODEV; /* specifying two interfaces is mandatory */ if (!tb[CGW_SRC_IF] || !tb[CGW_DST_IF]) return err; ccgw->src_idx = nla_get_u32(tb[CGW_SRC_IF]); ccgw->dst_idx = nla_get_u32(tb[CGW_DST_IF]); /* both indices set to 0 for flushing all routing entries */ if (!ccgw->src_idx && !ccgw->dst_idx) return 0; /* only one index set to 0 is an error */ if (!ccgw->src_idx || !ccgw->dst_idx) return err; } /* add the checks for other gwtypes here */ return 0; } static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct rtcanmsg *r; struct cgw_job *gwj; struct cf_mod mod; struct can_can_gw ccgw; u8 limhops = 0; int err = 0; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (nlmsg_len(nlh) < sizeof(*r)) return -EINVAL; r = nlmsg_data(nlh); if (r->can_family != AF_CAN) return -EPFNOSUPPORT; /* so far we only support CAN -> CAN routings */ if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); if (err < 0) return err; if (mod.uid) { ASSERT_RTNL(); /* check for updating an existing job with identical uid */ hlist_for_each_entry(gwj, &net->can.cgw_list, list) { if (gwj->mod.uid != mod.uid) continue; /* interfaces & filters must be identical */ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) return -EINVAL; /* update modifications with disabled softirq & quit */ local_bh_disable(); memcpy(&gwj->mod, &mod, sizeof(mod)); local_bh_enable(); return 0; } } /* ifindex == 0 is not allowed for job creation */ if (!ccgw.src_idx || !ccgw.dst_idx) return -ENODEV; gwj = kmem_cache_alloc(cgw_cache, GFP_KERNEL); if (!gwj) return -ENOMEM; gwj->handled_frames = 0; 
gwj->dropped_frames = 0; gwj->deleted_frames = 0; gwj->flags = r->flags; gwj->gwtype = r->gwtype; gwj->limit_hops = limhops; /* insert already parsed information */ memcpy(&gwj->mod, &mod, sizeof(mod)); memcpy(&gwj->ccgw, &ccgw, sizeof(ccgw)); err = -ENODEV; gwj->src.dev = __dev_get_by_index(net, gwj->ccgw.src_idx); if (!gwj->src.dev) goto out; if (gwj->src.dev->type != ARPHRD_CAN) goto out; gwj->dst.dev = __dev_get_by_index(net, gwj->ccgw.dst_idx); if (!gwj->dst.dev) goto out; if (gwj->dst.dev->type != ARPHRD_CAN) goto out; ASSERT_RTNL(); err = cgw_register_filter(net, gwj); if (!err) hlist_add_head_rcu(&gwj->list, &net->can.cgw_list); out: if (err) kmem_cache_free(cgw_cache, gwj); return err; } static void cgw_remove_all_jobs(struct net *net) { struct cgw_job *gwj = NULL; struct hlist_node *nx; ASSERT_RTNL(); hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); synchronize_rcu(); kmem_cache_free(cgw_cache, gwj); } } static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct cgw_job *gwj = NULL; struct hlist_node *nx; struct rtcanmsg *r; struct cf_mod mod; struct can_can_gw ccgw; u8 limhops = 0; int err = 0; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (nlmsg_len(nlh) < sizeof(*r)) return -EINVAL; r = nlmsg_data(nlh); if (r->can_family != AF_CAN) return -EPFNOSUPPORT; /* so far we only support CAN -> CAN routings */ if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); if (err < 0) return err; /* two interface indices both set to 0 => remove all entries */ if (!ccgw.src_idx && !ccgw.dst_idx) { cgw_remove_all_jobs(net); return 0; } err = -EINVAL; ASSERT_RTNL(); /* remove only the first matching entry */ hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { if (gwj->flags != r->flags) continue; if (gwj->limit_hops != limhops) continue; /* we have a match when uid is enabled and identical */ if (gwj->mod.uid || mod.uid) { if (gwj->mod.uid != mod.uid) continue; } else { /* no uid => check for identical modifications */ if (memcmp(&gwj->mod, &mod, sizeof(mod))) continue; } /* if (r->gwtype == CGW_TYPE_CAN_CAN) - is made sure here */ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) continue; hlist_del(&gwj->list); cgw_unregister_filter(net, gwj); synchronize_rcu(); kmem_cache_free(cgw_cache, gwj); err = 0; break; } return err; } static int __net_init cangw_pernet_init(struct net *net) { INIT_HLIST_HEAD(&net->can.cgw_list); return 0; } static void __net_exit cangw_pernet_exit(struct net *net) { rtnl_lock(); cgw_remove_all_jobs(net); rtnl_unlock(); } static struct pernet_operations cangw_pernet_ops = { .init = cangw_pernet_init, .exit = cangw_pernet_exit, }; static __init int cgw_module_init(void) { int ret; /* sanitize given module parameter */ max_hops = clamp_t(unsigned int, max_hops, CGW_MIN_HOPS, CGW_MAX_HOPS); pr_info("can: netlink gateway (rev " CAN_GW_VERSION ") max_hops=%d\n", max_hops); ret = register_pernet_subsys(&cangw_pernet_ops); if (ret) return ret; ret = -ENOMEM; cgw_cache = kmem_cache_create("can_gw", sizeof(struct cgw_job), 0, 0, NULL); if (!cgw_cache) goto out_cache_create; /* set notifier */ notifier.notifier_call = cgw_notifier; ret = register_netdevice_notifier(¬ifier); if (ret) goto out_register_notifier; ret = rtnl_register_module(THIS_MODULE, PF_CAN, RTM_GETROUTE, NULL, cgw_dump_jobs, 0); if (ret) goto out_rtnl_register1; ret = 
rtnl_register_module(THIS_MODULE, PF_CAN, RTM_NEWROUTE, cgw_create_job, NULL, 0); if (ret) goto out_rtnl_register2; ret = rtnl_register_module(THIS_MODULE, PF_CAN, RTM_DELROUTE, cgw_remove_job, NULL, 0); if (ret) goto out_rtnl_register3; return 0; out_rtnl_register3: rtnl_unregister(PF_CAN, RTM_NEWROUTE); out_rtnl_register2: rtnl_unregister(PF_CAN, RTM_GETROUTE); out_rtnl_register1: unregister_netdevice_notifier(¬ifier); out_register_notifier: kmem_cache_destroy(cgw_cache); out_cache_create: unregister_pernet_subsys(&cangw_pernet_ops); return ret; } static __exit void cgw_module_exit(void) { rtnl_unregister_all(PF_CAN); unregister_netdevice_notifier(¬ifier); unregister_pernet_subsys(&cangw_pernet_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ kmem_cache_destroy(cgw_cache); } module_init(cgw_module_init); module_exit(cgw_module_exit); |
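/*
 * Illustration only (not part of can-gw): a standalone user-space sketch of
 * the dlc-relative indexing used by calc_idx() and cgw_csum_xor_rel() above.
 * Negative from/to/result indices count back from the received dlc, so for
 * dlc = 8 an index of -1 selects data[7] and -8 selects data[0]. The payload
 * bytes below are made up for the demonstration.
 */
#include <stdio.h>
#include <stdint.h>

static int calc_idx(int idx, int rx_dlc)
{
	/* negative indices are relative to the received dlc */
	return idx < 0 ? rx_dlc + idx : idx;
}

static void xor_rel(uint8_t *data, int dlc, int from_idx, int to_idx,
		    int result_idx, uint8_t init_val)
{
	int from = calc_idx(from_idx, dlc);
	int to = calc_idx(to_idx, dlc);
	int res = calc_idx(result_idx, dlc);
	uint8_t val = init_val;
	int i;

	/* an out-of-range relative index disables the checksum update */
	if (from < 0 || to < 0 || res < 0)
		return;

	if (from <= to)
		for (i = from; i <= to; i++)
			val ^= data[i];
	else
		for (i = from; i >= to; i--)
			val ^= data[i];

	data[res] = val;
}

int main(void)
{
	uint8_t data[8] = { 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x00 };

	/* XOR data[0]..data[6] (to_idx -2 => index 6) into the last byte (-1 => index 7) */
	xor_rel(data, 8, 0, -2, -1, 0xFF);
	printf("checksum byte: 0x%02x\n", data[7]);
	return 0;
}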
/*
 * Mapping of UID/GIDs to name and vice versa.
 *
 * Copyright (c) 2002, 2003 The Regents of the University of
 * Michigan.  All rights reserved.
 *
 * Marius Aamodt Eriksen <marius@umich.edu>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <linux/module.h> #include <linux/seq_file.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/sunrpc/svc_xprt.h> #include <net/net_namespace.h> #include "idmap.h" #include "nfsd.h" #include "netns.h" /* * Turn off idmapping when using AUTH_SYS. */ static bool nfs4_disable_idmapping = true; module_param(nfs4_disable_idmapping, bool, 0644); MODULE_PARM_DESC(nfs4_disable_idmapping, "Turn off server's NFSv4 idmapping when using 'sec=sys'"); /* * Cache entry */ /* * XXX we know that IDMAP_NAMESZ < PAGE_SIZE, but it's ugly to rely on * that. */ struct ent { struct cache_head h; int type; /* User / Group */ u32 id; char name[IDMAP_NAMESZ]; char authname[IDMAP_NAMESZ]; }; /* Common entry handling */ #define ENT_HASHBITS 8 #define ENT_HASHMAX (1 << ENT_HASHBITS) static void ent_init(struct cache_head *cnew, struct cache_head *citm) { struct ent *new = container_of(cnew, struct ent, h); struct ent *itm = container_of(citm, struct ent, h); new->id = itm->id; new->type = itm->type; strlcpy(new->name, itm->name, sizeof(new->name)); strlcpy(new->authname, itm->authname, sizeof(new->name)); } static void ent_put(struct kref *ref) { struct ent *map = container_of(ref, struct ent, h.ref); kfree(map); } static struct cache_head * ent_alloc(void) { struct ent *e = kmalloc(sizeof(*e), GFP_KERNEL); if (e) return &e->h; else return NULL; } /* * ID -> Name cache */ static uint32_t idtoname_hash(struct ent *ent) { uint32_t hash; hash = hash_str(ent->authname, ENT_HASHBITS); hash = hash_long(hash ^ ent->id, ENT_HASHBITS); /* Flip LSB for user/group */ if (ent->type == IDMAP_TYPE_GROUP) hash ^= 1; return hash; } static void idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, int *blen) { struct ent *ent = container_of(ch, struct ent, h); char idstr[11]; qword_add(bpp, blen, ent->authname); snprintf(idstr, sizeof(idstr), "%u", ent->id); qword_add(bpp, blen, ent->type == IDMAP_TYPE_GROUP ? "group" : "user"); qword_add(bpp, blen, idstr); (*bpp)[-1] = '\n'; } static int idtoname_match(struct cache_head *ca, struct cache_head *cb) { struct ent *a = container_of(ca, struct ent, h); struct ent *b = container_of(cb, struct ent, h); return (a->id == b->id && a->type == b->type && strcmp(a->authname, b->authname) == 0); } static int idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { struct ent *ent; if (h == NULL) { seq_puts(m, "#domain type id [name]\n"); return 0; } ent = container_of(h, struct ent, h); seq_printf(m, "%s %s %u", ent->authname, ent->type == IDMAP_TYPE_GROUP ? 
"group" : "user", ent->id); if (test_bit(CACHE_VALID, &h->flags)) seq_printf(m, " %s", ent->name); seq_printf(m, "\n"); return 0; } static void warn_no_idmapd(struct cache_detail *detail, int has_died) { printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n", has_died ? "died" : "not been started"); } static int idtoname_parse(struct cache_detail *, char *, int); static struct ent *idtoname_lookup(struct cache_detail *, struct ent *); static struct ent *idtoname_update(struct cache_detail *, struct ent *, struct ent *); static const struct cache_detail idtoname_cache_template = { .owner = THIS_MODULE, .hash_size = ENT_HASHMAX, .name = "nfs4.idtoname", .cache_put = ent_put, .cache_request = idtoname_request, .cache_parse = idtoname_parse, .cache_show = idtoname_show, .warn_no_listener = warn_no_idmapd, .match = idtoname_match, .init = ent_init, .update = ent_init, .alloc = ent_alloc, }; static int idtoname_parse(struct cache_detail *cd, char *buf, int buflen) { struct ent ent, *res; char *buf1, *bp; int len; int error = -EINVAL; if (buf[buflen - 1] != '\n') return (-EINVAL); buf[buflen - 1]= '\0'; buf1 = kmalloc(PAGE_SIZE, GFP_KERNEL); if (buf1 == NULL) return (-ENOMEM); memset(&ent, 0, sizeof(ent)); /* Authentication name */ len = qword_get(&buf, buf1, PAGE_SIZE); if (len <= 0 || len >= IDMAP_NAMESZ) goto out; memcpy(ent.authname, buf1, sizeof(ent.authname)); /* Type */ if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) goto out; ent.type = strcmp(buf1, "user") == 0 ? IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; /* ID */ if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) goto out; ent.id = simple_strtoul(buf1, &bp, 10); if (bp == buf1) goto out; /* expiry */ ent.h.expiry_time = get_expiry(&buf); if (ent.h.expiry_time == 0) goto out; error = -ENOMEM; res = idtoname_lookup(cd, &ent); if (!res) goto out; /* Name */ error = -EINVAL; len = qword_get(&buf, buf1, PAGE_SIZE); if (len < 0 || len >= IDMAP_NAMESZ) goto out; if (len == 0) set_bit(CACHE_NEGATIVE, &ent.h.flags); else memcpy(ent.name, buf1, sizeof(ent.name)); error = -ENOMEM; res = idtoname_update(cd, &ent, res); if (res == NULL) goto out; cache_put(&res->h, cd); error = 0; out: kfree(buf1); return error; } static struct ent * idtoname_lookup(struct cache_detail *cd, struct ent *item) { struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h, idtoname_hash(item)); if (ch) return container_of(ch, struct ent, h); else return NULL; } static struct ent * idtoname_update(struct cache_detail *cd, struct ent *new, struct ent *old) { struct cache_head *ch = sunrpc_cache_update(cd, &new->h, &old->h, idtoname_hash(new)); if (ch) return container_of(ch, struct ent, h); else return NULL; } /* * Name -> ID cache */ static inline int nametoid_hash(struct ent *ent) { return hash_str(ent->name, ENT_HASHBITS); } static void nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, int *blen) { struct ent *ent = container_of(ch, struct ent, h); qword_add(bpp, blen, ent->authname); qword_add(bpp, blen, ent->type == IDMAP_TYPE_GROUP ? 
"group" : "user"); qword_add(bpp, blen, ent->name); (*bpp)[-1] = '\n'; } static int nametoid_match(struct cache_head *ca, struct cache_head *cb) { struct ent *a = container_of(ca, struct ent, h); struct ent *b = container_of(cb, struct ent, h); return (a->type == b->type && strcmp(a->name, b->name) == 0 && strcmp(a->authname, b->authname) == 0); } static int nametoid_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { struct ent *ent; if (h == NULL) { seq_puts(m, "#domain type name [id]\n"); return 0; } ent = container_of(h, struct ent, h); seq_printf(m, "%s %s %s", ent->authname, ent->type == IDMAP_TYPE_GROUP ? "group" : "user", ent->name); if (test_bit(CACHE_VALID, &h->flags)) seq_printf(m, " %u", ent->id); seq_printf(m, "\n"); return 0; } static struct ent *nametoid_lookup(struct cache_detail *, struct ent *); static struct ent *nametoid_update(struct cache_detail *, struct ent *, struct ent *); static int nametoid_parse(struct cache_detail *, char *, int); static const struct cache_detail nametoid_cache_template = { .owner = THIS_MODULE, .hash_size = ENT_HASHMAX, .name = "nfs4.nametoid", .cache_put = ent_put, .cache_request = nametoid_request, .cache_parse = nametoid_parse, .cache_show = nametoid_show, .warn_no_listener = warn_no_idmapd, .match = nametoid_match, .init = ent_init, .update = ent_init, .alloc = ent_alloc, }; static int nametoid_parse(struct cache_detail *cd, char *buf, int buflen) { struct ent ent, *res; char *buf1; int len, error = -EINVAL; if (buf[buflen - 1] != '\n') return (-EINVAL); buf[buflen - 1]= '\0'; buf1 = kmalloc(PAGE_SIZE, GFP_KERNEL); if (buf1 == NULL) return (-ENOMEM); memset(&ent, 0, sizeof(ent)); /* Authentication name */ len = qword_get(&buf, buf1, PAGE_SIZE); if (len <= 0 || len >= IDMAP_NAMESZ) goto out; memcpy(ent.authname, buf1, sizeof(ent.authname)); /* Type */ if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) goto out; ent.type = strcmp(buf1, "user") == 0 ? 
IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; /* Name */ len = qword_get(&buf, buf1, PAGE_SIZE); if (len <= 0 || len >= IDMAP_NAMESZ) goto out; memcpy(ent.name, buf1, sizeof(ent.name)); /* expiry */ ent.h.expiry_time = get_expiry(&buf); if (ent.h.expiry_time == 0) goto out; /* ID */ error = get_int(&buf, &ent.id); if (error == -EINVAL) goto out; if (error == -ENOENT) set_bit(CACHE_NEGATIVE, &ent.h.flags); error = -ENOMEM; res = nametoid_lookup(cd, &ent); if (res == NULL) goto out; res = nametoid_update(cd, &ent, res); if (res == NULL) goto out; cache_put(&res->h, cd); error = 0; out: kfree(buf1); return (error); } static struct ent * nametoid_lookup(struct cache_detail *cd, struct ent *item) { struct cache_head *ch = sunrpc_cache_lookup(cd, &item->h, nametoid_hash(item)); if (ch) return container_of(ch, struct ent, h); else return NULL; } static struct ent * nametoid_update(struct cache_detail *cd, struct ent *new, struct ent *old) { struct cache_head *ch = sunrpc_cache_update(cd, &new->h, &old->h, nametoid_hash(new)); if (ch) return container_of(ch, struct ent, h); else return NULL; } /* * Exported API */ int nfsd_idmap_init(struct net *net) { int rv; struct nfsd_net *nn = net_generic(net, nfsd_net_id); nn->idtoname_cache = cache_create_net(&idtoname_cache_template, net); if (IS_ERR(nn->idtoname_cache)) return PTR_ERR(nn->idtoname_cache); rv = cache_register_net(nn->idtoname_cache, net); if (rv) goto destroy_idtoname_cache; nn->nametoid_cache = cache_create_net(&nametoid_cache_template, net); if (IS_ERR(nn->nametoid_cache)) { rv = PTR_ERR(nn->nametoid_cache); goto unregister_idtoname_cache; } rv = cache_register_net(nn->nametoid_cache, net); if (rv) goto destroy_nametoid_cache; return 0; destroy_nametoid_cache: cache_destroy_net(nn->nametoid_cache, net); unregister_idtoname_cache: cache_unregister_net(nn->idtoname_cache, net); destroy_idtoname_cache: cache_destroy_net(nn->idtoname_cache, net); return rv; } void nfsd_idmap_shutdown(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); cache_unregister_net(nn->idtoname_cache, net); cache_unregister_net(nn->nametoid_cache, net); cache_destroy_net(nn->idtoname_cache, net); cache_destroy_net(nn->nametoid_cache, net); } static int idmap_lookup(struct svc_rqst *rqstp, struct ent *(*lookup_fn)(struct cache_detail *, struct ent *), struct ent *key, struct cache_detail *detail, struct ent **item) { int ret; *item = lookup_fn(detail, key); if (!*item) return -ENOMEM; retry: ret = cache_check(detail, &(*item)->h, &rqstp->rq_chandle); if (ret == -ETIMEDOUT) { struct ent *prev_item = *item; *item = lookup_fn(detail, key); if (*item != prev_item) goto retry; cache_put(&(*item)->h, detail); } return ret; } static char * rqst_authname(struct svc_rqst *rqstp) { struct auth_domain *clp; clp = rqstp->rq_gssclient ? 
rqstp->rq_gssclient : rqstp->rq_client; return clp->name; } static __be32 idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id) { struct ent *item, key = { .type = type, }; int ret; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); if (namelen + 1 > sizeof(key.name)) return nfserr_badowner; memcpy(key.name, name, namelen); key.name[namelen] = '\0'; strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, nametoid_lookup, &key, nn->nametoid_cache, &item); if (ret == -ENOENT) return nfserr_badowner; if (ret) return nfserrno(ret); *id = item->id; cache_put(&item->h, nn->nametoid_cache); return 0; } static __be32 encode_ascii_id(struct xdr_stream *xdr, u32 id) { char buf[11]; int len; __be32 *p; len = sprintf(buf, "%u", id); p = xdr_reserve_space(xdr, len + 4); if (!p) return nfserr_resource; p = xdr_encode_opaque(p, buf, len); return 0; } static __be32 idmap_id_to_name(struct xdr_stream *xdr, struct svc_rqst *rqstp, int type, u32 id) { struct ent *item, key = { .id = id, .type = type, }; __be32 *p; int ret; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname)); ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item); if (ret == -ENOENT) return encode_ascii_id(xdr, id); if (ret) return nfserrno(ret); ret = strlen(item->name); WARN_ON_ONCE(ret > IDMAP_NAMESZ); p = xdr_reserve_space(xdr, ret + 4); if (!p) return nfserr_resource; p = xdr_encode_opaque(p, item->name, ret); cache_put(&item->h, nn->idtoname_cache); return 0; } static bool numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id) { int ret; char buf[11]; if (namelen + 1 > sizeof(buf)) /* too long to represent a 32-bit id: */ return false; /* Just to make sure it's null-terminated: */ memcpy(buf, name, namelen); buf[namelen] = '\0'; ret = kstrtouint(buf, 10, id); return ret == 0; } static __be32 do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u32 *id) { if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) if (numeric_name_to_id(rqstp, type, name, namelen, id)) return 0; /* * otherwise, fall through and try idmapping, for * backwards compatibility with clients sending names: */ return idmap_name_to_id(rqstp, type, name, namelen, id); } static __be32 encode_name_from_id(struct xdr_stream *xdr, struct svc_rqst *rqstp, int type, u32 id) { if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS) return encode_ascii_id(xdr, id); return idmap_id_to_name(xdr, rqstp, type, id); } __be32 nfsd_map_name_to_uid(struct svc_rqst *rqstp, const char *name, size_t namelen, kuid_t *uid) { __be32 status; u32 id = -1; if (name == NULL || namelen == 0) return nfserr_inval; status = do_name_to_id(rqstp, IDMAP_TYPE_USER, name, namelen, &id); *uid = make_kuid(&init_user_ns, id); if (!uid_valid(*uid)) status = nfserr_badowner; return status; } __be32 nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen, kgid_t *gid) { __be32 status; u32 id = -1; if (name == NULL || namelen == 0) return nfserr_inval; status = do_name_to_id(rqstp, IDMAP_TYPE_GROUP, name, namelen, &id); *gid = make_kgid(&init_user_ns, id); if (!gid_valid(*gid)) status = nfserr_badowner; return status; } __be32 nfsd4_encode_user(struct xdr_stream *xdr, struct svc_rqst *rqstp, kuid_t uid) { u32 id = from_kuid(&init_user_ns, uid); return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_USER, id); } __be32 
nfsd4_encode_group(struct xdr_stream *xdr, struct svc_rqst *rqstp, kgid_t gid) { u32 id = from_kgid(&init_user_ns, gid); return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_GROUP, id); }
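/*
 * Hedged user-space sketch: the idtoname/nametoid caches above are filled by
 * a user-space daemon that answers upcalls through the sunrpc cache channel
 * files.  This program composes one "idtoname" downcall line in the field
 * order that idtoname_parse() expects:
 *
 *     <authname> <type> <id> <expiry> [<name>]
 *
 * The channel path is an assumption, and plain space-separated fields are a
 * simplification of the kernel's qword encoding (valid only while no field
 * contains special characters); a real idmapd uses libnfsidmap and handles
 * escaping and errors.
 */
#include <stdio.h>
#include <time.h>

int main(void)
{
	const char *channel = "/proc/net/rpc/nfs4.idtoname/channel"; /* assumed path */
	const char *authname = "localdomain";	/* auth_domain name from the upcall */
	const char *type = "user";		/* "user" or "group" */
	unsigned int id = 1000;
	const char *name = "alice@localdomain";	/* a zero-length name would mark the entry negative */
	long expiry = (long)time(NULL) + 3600;	/* cache the mapping for an hour */

	FILE *f = fopen(channel, "w");
	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "%s %s %u %ld %s\n", authname, type, id, expiry, name);
	fclose(f);
	return 0;
}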
/*
 * Cryptographic API.
 *
 * SHA-256, as specified in
 * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
 *
 * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>.
 *
 * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
 * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
 * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
 * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
* */ #include <crypto/internal/hash.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> #include <crypto/sha.h> #include <crypto/sha256_base.h> #include <asm/byteorder.h> #include <asm/unaligned.h> const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE] = { 0xd1, 0x4a, 0x02, 0x8c, 0x2a, 0x3a, 0x2b, 0xc9, 0x47, 0x61, 0x02, 0xbb, 0x28, 0x82, 0x34, 0xc4, 0x15, 0xa2, 0xb0, 0x1f, 0x82, 0x8e, 0xa6, 0x2a, 0xc5, 0xb3, 0xe4, 0x2f }; EXPORT_SYMBOL_GPL(sha224_zero_message_hash); const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE] = { 0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14, 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f, 0xb9, 0x24, 0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c, 0xa4, 0x95, 0x99, 0x1b, 0x78, 0x52, 0xb8, 0x55 }; EXPORT_SYMBOL_GPL(sha256_zero_message_hash); static inline u32 Ch(u32 x, u32 y, u32 z) { return z ^ (x & (y ^ z)); } static inline u32 Maj(u32 x, u32 y, u32 z) { return (x & y) | (z & (x | y)); } #define e0(x) (ror32(x, 2) ^ ror32(x,13) ^ ror32(x,22)) #define e1(x) (ror32(x, 6) ^ ror32(x,11) ^ ror32(x,25)) #define s0(x) (ror32(x, 7) ^ ror32(x,18) ^ (x >> 3)) #define s1(x) (ror32(x,17) ^ ror32(x,19) ^ (x >> 10)) static inline void LOAD_OP(int I, u32 *W, const u8 *input) { W[I] = get_unaligned_be32((__u32 *)input + I); } static inline void BLEND_OP(int I, u32 *W) { W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16]; } static void sha256_transform(u32 *state, const u8 *input) { u32 a, b, c, d, e, f, g, h, t1, t2; u32 W[64]; int i; /* load the input */ for (i = 0; i < 16; i++) LOAD_OP(i, W, input); /* now blend */ for (i = 16; i < 64; i++) BLEND_OP(i, W); /* load the state into our registers */ a=state[0]; b=state[1]; c=state[2]; d=state[3]; e=state[4]; f=state[5]; g=state[6]; h=state[7]; /* now iterate */ t1 = h + e1(e) + Ch(e,f,g) + 0x428a2f98 + W[ 0]; t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; t1 = g + e1(d) + Ch(d,e,f) + 0x71374491 + W[ 1]; t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; t1 = f + e1(c) + Ch(c,d,e) + 0xb5c0fbcf + W[ 2]; t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; t1 = e + e1(b) + Ch(b,c,d) + 0xe9b5dba5 + W[ 3]; t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; t1 = d + e1(a) + Ch(a,b,c) + 0x3956c25b + W[ 4]; t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; t1 = c + e1(h) + Ch(h,a,b) + 0x59f111f1 + W[ 5]; t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; t1 = b + e1(g) + Ch(g,h,a) + 0x923f82a4 + W[ 6]; t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; t1 = a + e1(f) + Ch(f,g,h) + 0xab1c5ed5 + W[ 7]; t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; t1 = h + e1(e) + Ch(e,f,g) + 0xd807aa98 + W[ 8]; t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; t1 = g + e1(d) + Ch(d,e,f) + 0x12835b01 + W[ 9]; t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; t1 = f + e1(c) + Ch(c,d,e) + 0x243185be + W[10]; t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; t1 = e + e1(b) + Ch(b,c,d) + 0x550c7dc3 + W[11]; t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; t1 = d + e1(a) + Ch(a,b,c) + 0x72be5d74 + W[12]; t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; t1 = c + e1(h) + Ch(h,a,b) + 0x80deb1fe + W[13]; t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; t1 = b + e1(g) + Ch(g,h,a) + 0x9bdc06a7 + W[14]; t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; t1 = a + e1(f) + Ch(f,g,h) + 0xc19bf174 + W[15]; t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; t1 = h + e1(e) + Ch(e,f,g) + 0xe49b69c1 + W[16]; t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; t1 = g + e1(d) + Ch(d,e,f) + 0xefbe4786 + W[17]; t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; t1 = f + e1(c) + Ch(c,d,e) + 0x0fc19dc6 + W[18]; t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; t1 = e + e1(b) + Ch(b,c,d) + 0x240ca1cc + W[19]; t2 = 
e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; t1 = d + e1(a) + Ch(a,b,c) + 0x2de92c6f + W[20]; t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; t1 = c + e1(h) + Ch(h,a,b) + 0x4a7484aa + W[21]; t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; t1 = b + e1(g) + Ch(g,h,a) + 0x5cb0a9dc + W[22]; t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; t1 = a + e1(f) + Ch(f,g,h) + 0x76f988da + W[23]; t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; t1 = h + e1(e) + Ch(e,f,g) + 0x983e5152 + W[24]; t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; t1 = g + e1(d) + Ch(d,e,f) + 0xa831c66d + W[25]; t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; t1 = f + e1(c) + Ch(c,d,e) + 0xb00327c8 + W[26]; t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; t1 = e + e1(b) + Ch(b,c,d) + 0xbf597fc7 + W[27]; t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; t1 = d + e1(a) + Ch(a,b,c) + 0xc6e00bf3 + W[28]; t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; t1 = c + e1(h) + Ch(h,a,b) + 0xd5a79147 + W[29]; t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; t1 = b + e1(g) + Ch(g,h,a) + 0x06ca6351 + W[30]; t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; t1 = a + e1(f) + Ch(f,g,h) + 0x14292967 + W[31]; t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; t1 = h + e1(e) + Ch(e,f,g) + 0x27b70a85 + W[32]; t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; t1 = g + e1(d) + Ch(d,e,f) + 0x2e1b2138 + W[33]; t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; t1 = f + e1(c) + Ch(c,d,e) + 0x4d2c6dfc + W[34]; t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; t1 = e + e1(b) + Ch(b,c,d) + 0x53380d13 + W[35]; t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; t1 = d + e1(a) + Ch(a,b,c) + 0x650a7354 + W[36]; t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; t1 = c + e1(h) + Ch(h,a,b) + 0x766a0abb + W[37]; t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; t1 = b + e1(g) + Ch(g,h,a) + 0x81c2c92e + W[38]; t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; t1 = a + e1(f) + Ch(f,g,h) + 0x92722c85 + W[39]; t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; t1 = h + e1(e) + Ch(e,f,g) + 0xa2bfe8a1 + W[40]; t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; t1 = g + e1(d) + Ch(d,e,f) + 0xa81a664b + W[41]; t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; t1 = f + e1(c) + Ch(c,d,e) + 0xc24b8b70 + W[42]; t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; t1 = e + e1(b) + Ch(b,c,d) + 0xc76c51a3 + W[43]; t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; t1 = d + e1(a) + Ch(a,b,c) + 0xd192e819 + W[44]; t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; t1 = c + e1(h) + Ch(h,a,b) + 0xd6990624 + W[45]; t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; t1 = b + e1(g) + Ch(g,h,a) + 0xf40e3585 + W[46]; t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; t1 = a + e1(f) + Ch(f,g,h) + 0x106aa070 + W[47]; t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; t1 = h + e1(e) + Ch(e,f,g) + 0x19a4c116 + W[48]; t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; t1 = g + e1(d) + Ch(d,e,f) + 0x1e376c08 + W[49]; t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; t1 = f + e1(c) + Ch(c,d,e) + 0x2748774c + W[50]; t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; t1 = e + e1(b) + Ch(b,c,d) + 0x34b0bcb5 + W[51]; t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; t1 = d + e1(a) + Ch(a,b,c) + 0x391c0cb3 + W[52]; t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; t1 = c + e1(h) + Ch(h,a,b) + 0x4ed8aa4a + W[53]; t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; t1 = b + e1(g) + Ch(g,h,a) + 0x5b9cca4f + W[54]; t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; t1 = a + e1(f) + Ch(f,g,h) + 0x682e6ff3 + W[55]; t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; t1 = h + e1(e) + Ch(e,f,g) + 0x748f82ee + W[56]; t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; t1 = g + e1(d) + Ch(d,e,f) + 0x78a5636f + W[57]; t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; t1 = f + e1(c) + Ch(c,d,e) + 0x84c87814 + W[58]; t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; t1 = e + 
e1(b) + Ch(b,c,d) + 0x8cc70208 + W[59]; t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; t1 = d + e1(a) + Ch(a,b,c) + 0x90befffa + W[60]; t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; t1 = c + e1(h) + Ch(h,a,b) + 0xa4506ceb + W[61]; t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; t1 = b + e1(g) + Ch(g,h,a) + 0xbef9a3f7 + W[62]; t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; t1 = a + e1(f) + Ch(f,g,h) + 0xc67178f2 + W[63]; t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; state[0] += a; state[1] += b; state[2] += c; state[3] += d; state[4] += e; state[5] += f; state[6] += g; state[7] += h; /* clear any sensitive info... */ a = b = c = d = e = f = g = h = t1 = t2 = 0; memzero_explicit(W, 64 * sizeof(u32)); } static void sha256_generic_block_fn(struct sha256_state *sst, u8 const *src, int blocks) { while (blocks--) { sha256_transform(sst->state, src); src += SHA256_BLOCK_SIZE; } } int crypto_sha256_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return sha256_base_do_update(desc, data, len, sha256_generic_block_fn); } EXPORT_SYMBOL(crypto_sha256_update); static int sha256_final(struct shash_desc *desc, u8 *out) { sha256_base_do_finalize(desc, sha256_generic_block_fn); return sha256_base_finish(desc, out); } int crypto_sha256_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *hash) { sha256_base_do_update(desc, data, len, sha256_generic_block_fn); return sha256_final(desc, hash); } EXPORT_SYMBOL(crypto_sha256_finup); static struct shash_alg sha256_algs[2] = { { .digestsize = SHA256_DIGEST_SIZE, .init = sha256_base_init, .update = crypto_sha256_update, .final = sha256_final, .finup = crypto_sha256_finup, .descsize = sizeof(struct sha256_state), .base = { .cra_name = "sha256", .cra_driver_name= "sha256-generic", .cra_priority = 100, .cra_blocksize = SHA256_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = SHA224_DIGEST_SIZE, .init = sha224_base_init, .update = crypto_sha256_update, .final = sha256_final, .finup = crypto_sha256_finup, .descsize = sizeof(struct sha256_state), .base = { .cra_name = "sha224", .cra_driver_name= "sha224-generic", .cra_priority = 100, .cra_blocksize = SHA224_BLOCK_SIZE, .cra_module = THIS_MODULE, } } }; static int __init sha256_generic_mod_init(void) { return crypto_register_shashes(sha256_algs, ARRAY_SIZE(sha256_algs)); } static void __exit sha256_generic_mod_fini(void) { crypto_unregister_shashes(sha256_algs, ARRAY_SIZE(sha256_algs)); } module_init(sha256_generic_mod_init); module_exit(sha256_generic_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA-224 and SHA-256 Secure Hash Algorithm"); MODULE_ALIAS_CRYPTO("sha224"); MODULE_ALIAS_CRYPTO("sha224-generic"); MODULE_ALIAS_CRYPTO("sha256"); MODULE_ALIAS_CRYPTO("sha256-generic"); |
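/*
 * Hedged user-space sketch: once the module above registers "sha256" and
 * "sha224" with the crypto API, user space can reach them through the
 * AF_ALG socket interface.  Assumes a kernel built with
 * CONFIG_CRYPTO_USER_API_HASH; note the crypto core may select an
 * accelerated driver with higher priority than sha256-generic for the
 * same "sha256" name.  Error handling is trimmed to keep the sketch short.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_alg.h>

int main(void)
{
	struct sockaddr_alg sa = {
		.salg_family = AF_ALG,
		.salg_type   = "hash",
		.salg_name   = "sha256",	/* cra_name registered above */
	};
	const char msg[] = "abc";
	unsigned char digest[32];
	int tfm, op;

	tfm = socket(AF_ALG, SOCK_SEQPACKET, 0);
	bind(tfm, (struct sockaddr *)&sa, sizeof(sa));
	op = accept(tfm, NULL, 0);

	write(op, msg, strlen(msg));		/* feed the message */
	read(op, digest, sizeof(digest));	/* finalize and fetch the digest */

	for (size_t i = 0; i < sizeof(digest); i++)
		printf("%02x", digest[i]);
	putchar('\n');

	close(op);
	close(tfm);
	return 0;
}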
/*
 * File: pep.c
 *
 * Phonet pipe protocol end point socket
 *
 * Copyright (C) 2008 Nokia Corporation.
 *
 * Author: Rémi Denis-Courmont
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA * 02110-1301 USA */ #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/socket.h> #include <net/sock.h> #include <net/tcp_states.h> #include <asm/ioctls.h> #include <linux/phonet.h> #include <linux/module.h> #include <net/phonet/phonet.h> #include <net/phonet/pep.h> #include <net/phonet/gprs.h> /* sk_state values: * TCP_CLOSE sock not in use yet * TCP_CLOSE_WAIT disconnected pipe * TCP_LISTEN listening pipe endpoint * TCP_SYN_RECV connected pipe in disabled state * TCP_ESTABLISHED connected pipe in enabled state * * pep_sock locking: * - sk_state, hlist: sock lock needed * - listener: read only * - pipe_handle: read only */ #define CREDITS_MAX 10 #define CREDITS_THR 7 #define pep_sb_size(s) (((s) + 5) & ~3) /* 2-bytes head, 32-bits aligned */ /* Get the next TLV sub-block. */ static unsigned char *pep_get_sb(struct sk_buff *skb, u8 *ptype, u8 *plen, void *buf) { void *data = NULL; struct { u8 sb_type; u8 sb_len; } *ph, h; int buflen = *plen; ph = skb_header_pointer(skb, 0, 2, &h); if (ph == NULL || ph->sb_len < 2 || !pskb_may_pull(skb, ph->sb_len)) return NULL; ph->sb_len -= 2; *ptype = ph->sb_type; *plen = ph->sb_len; if (buflen > ph->sb_len) buflen = ph->sb_len; data = skb_header_pointer(skb, 2, buflen, buf); __skb_pull(skb, 2 + ph->sb_len); return data; } static struct sk_buff *pep_alloc_skb(struct sock *sk, const void *payload, int len, gfp_t priority) { struct sk_buff *skb = alloc_skb(MAX_PNPIPE_HEADER + len, priority); if (!skb) return NULL; skb_set_owner_w(skb, sk); skb_reserve(skb, MAX_PNPIPE_HEADER); __skb_put(skb, len); skb_copy_to_linear_data(skb, payload, len); __skb_push(skb, sizeof(struct pnpipehdr)); skb_reset_transport_header(skb); return skb; } static int pep_reply(struct sock *sk, struct sk_buff *oskb, u8 code, const void *data, int len, gfp_t priority) { const struct pnpipehdr *oph = pnp_hdr(oskb); struct pnpipehdr *ph; struct sk_buff *skb; struct sockaddr_pn peer; skb = pep_alloc_skb(sk, data, len, priority); if (!skb) return -ENOMEM; ph = pnp_hdr(skb); ph->utid = oph->utid; ph->message_id = oph->message_id + 1; /* REQ -> RESP */ ph->pipe_handle = oph->pipe_handle; ph->error_code = code; pn_skb_get_src_sockaddr(oskb, &peer); return pn_skb_send(sk, skb, &peer); } static int pep_indicate(struct sock *sk, u8 id, u8 code, const void *data, int len, gfp_t priority) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *ph; struct sk_buff *skb; skb = pep_alloc_skb(sk, data, len, priority); if (!skb) return -ENOMEM; ph = pnp_hdr(skb); ph->utid = 0; ph->message_id = id; ph->pipe_handle = pn->pipe_handle; ph->error_code = code; return pn_skb_send(sk, skb, NULL); } #define PAD 0x00 static int pipe_handler_request(struct sock *sk, u8 id, u8 code, const void *data, int len) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *ph; struct sk_buff *skb; skb = pep_alloc_skb(sk, data, len, GFP_KERNEL); if (!skb) return -ENOMEM; ph = pnp_hdr(skb); ph->utid = id; /* whatever */ ph->message_id = id; ph->pipe_handle = pn->pipe_handle; ph->error_code = code; return pn_skb_send(sk, skb, NULL); } static int pipe_handler_send_created_ind(struct sock *sk) { struct pep_sock *pn = pep_sk(sk); u8 data[4] = { PN_PIPE_SB_NEGOTIATED_FC, pep_sb_size(2), pn->tx_fc, pn->rx_fc, }; return pep_indicate(sk, PNS_PIPE_CREATED_IND, 1 /* sub-blocks */, data, 4, GFP_ATOMIC); } 
static int pep_accept_conn(struct sock *sk, struct sk_buff *skb) { static const u8 data[20] = { PAD, PAD, PAD, 2 /* sub-blocks */, PN_PIPE_SB_REQUIRED_FC_TX, pep_sb_size(5), 3, PAD, PN_MULTI_CREDIT_FLOW_CONTROL, PN_ONE_CREDIT_FLOW_CONTROL, PN_LEGACY_FLOW_CONTROL, PAD, PN_PIPE_SB_PREFERRED_FC_RX, pep_sb_size(5), 3, PAD, PN_MULTI_CREDIT_FLOW_CONTROL, PN_ONE_CREDIT_FLOW_CONTROL, PN_LEGACY_FLOW_CONTROL, PAD, }; might_sleep(); return pep_reply(sk, skb, PN_PIPE_NO_ERROR, data, sizeof(data), GFP_KERNEL); } static int pep_reject_conn(struct sock *sk, struct sk_buff *skb, u8 code, gfp_t priority) { static const u8 data[4] = { PAD, PAD, PAD, 0 /* sub-blocks */ }; WARN_ON(code == PN_PIPE_NO_ERROR); return pep_reply(sk, skb, code, data, sizeof(data), priority); } /* Control requests are not sent by the pipe service and have a specific * message format. */ static int pep_ctrlreq_error(struct sock *sk, struct sk_buff *oskb, u8 code, gfp_t priority) { const struct pnpipehdr *oph = pnp_hdr(oskb); struct sk_buff *skb; struct pnpipehdr *ph; struct sockaddr_pn dst; u8 data[4] = { oph->pep_type, /* PEP type */ code, /* error code, at an unusual offset */ PAD, PAD, }; skb = pep_alloc_skb(sk, data, 4, priority); if (!skb) return -ENOMEM; ph = pnp_hdr(skb); ph->utid = oph->utid; ph->message_id = PNS_PEP_CTRL_RESP; ph->pipe_handle = oph->pipe_handle; ph->data0 = oph->data[0]; /* CTRL id */ pn_skb_get_src_sockaddr(oskb, &dst); return pn_skb_send(sk, skb, &dst); } static int pipe_snd_status(struct sock *sk, u8 type, u8 status, gfp_t priority) { u8 data[4] = { type, PAD, PAD, status }; return pep_indicate(sk, PNS_PEP_STATUS_IND, PN_PEP_TYPE_COMMON, data, 4, priority); } /* Send our RX flow control information to the sender. * Socket must be locked. */ static void pipe_grant_credits(struct sock *sk, gfp_t priority) { struct pep_sock *pn = pep_sk(sk); BUG_ON(sk->sk_state != TCP_ESTABLISHED); switch (pn->rx_fc) { case PN_LEGACY_FLOW_CONTROL: /* TODO */ break; case PN_ONE_CREDIT_FLOW_CONTROL: if (pipe_snd_status(sk, PN_PEP_IND_FLOW_CONTROL, PEP_IND_READY, priority) == 0) pn->rx_credits = 1; break; case PN_MULTI_CREDIT_FLOW_CONTROL: if ((pn->rx_credits + CREDITS_THR) > CREDITS_MAX) break; if (pipe_snd_status(sk, PN_PEP_IND_ID_MCFC_GRANT_CREDITS, CREDITS_MAX - pn->rx_credits, priority) == 0) pn->rx_credits = CREDITS_MAX; break; } } static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *hdr; int wake = 0; if (!pskb_may_pull(skb, sizeof(*hdr) + 4)) return -EINVAL; hdr = pnp_hdr(skb); if (hdr->pep_type != PN_PEP_TYPE_COMMON) { net_dbg_ratelimited("Phonet unknown PEP type: %u\n", (unsigned int)hdr->pep_type); return -EOPNOTSUPP; } switch (hdr->data[0]) { case PN_PEP_IND_FLOW_CONTROL: switch (pn->tx_fc) { case PN_LEGACY_FLOW_CONTROL: switch (hdr->data[3]) { case PEP_IND_BUSY: atomic_set(&pn->tx_credits, 0); break; case PEP_IND_READY: atomic_set(&pn->tx_credits, wake = 1); break; } break; case PN_ONE_CREDIT_FLOW_CONTROL: if (hdr->data[3] == PEP_IND_READY) atomic_set(&pn->tx_credits, wake = 1); break; } break; case PN_PEP_IND_ID_MCFC_GRANT_CREDITS: if (pn->tx_fc != PN_MULTI_CREDIT_FLOW_CONTROL) break; atomic_add(wake = hdr->data[3], &pn->tx_credits); break; default: net_dbg_ratelimited("Phonet unknown PEP indication: %u\n", (unsigned int)hdr->data[0]); return -EOPNOTSUPP; } if (wake) sk->sk_write_space(sk); return 0; } static int pipe_rcv_created(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *hdr = pnp_hdr(skb); u8 n_sb = 
hdr->data0; pn->rx_fc = pn->tx_fc = PN_LEGACY_FLOW_CONTROL; __skb_pull(skb, sizeof(*hdr)); while (n_sb > 0) { u8 type, buf[2], len = sizeof(buf); u8 *data = pep_get_sb(skb, &type, &len, buf); if (data == NULL) return -EINVAL; switch (type) { case PN_PIPE_SB_NEGOTIATED_FC: if (len < 2 || (data[0] | data[1]) > 3) break; pn->tx_fc = data[0] & 3; pn->rx_fc = data[1] & 3; break; } n_sb--; } return 0; } /* Queue an skb to a connected sock. * Socket lock must be held. */ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *hdr = pnp_hdr(skb); struct sk_buff_head *queue; int err = 0; BUG_ON(sk->sk_state == TCP_CLOSE_WAIT); switch (hdr->message_id) { case PNS_PEP_CONNECT_REQ: pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE, GFP_ATOMIC); break; case PNS_PEP_DISCONNECT_REQ: pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); sk->sk_state = TCP_CLOSE_WAIT; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); break; case PNS_PEP_ENABLE_REQ: /* Wait for PNS_PIPE_(ENABLED|REDIRECTED)_IND */ pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); break; case PNS_PEP_RESET_REQ: switch (hdr->state_after_reset) { case PN_PIPE_DISABLE: pn->init_enable = 0; break; case PN_PIPE_ENABLE: pn->init_enable = 1; break; default: /* not allowed to send an error here!? */ err = -EINVAL; goto out; } /* fall through */ case PNS_PEP_DISABLE_REQ: atomic_set(&pn->tx_credits, 0); pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); break; case PNS_PEP_CTRL_REQ: if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) { atomic_inc(&sk->sk_drops); break; } __skb_pull(skb, 4); queue = &pn->ctrlreq_queue; goto queue; case PNS_PIPE_ALIGNED_DATA: __skb_pull(skb, 1); /* fall through */ case PNS_PIPE_DATA: __skb_pull(skb, 3); /* Pipe data header */ if (!pn_flow_safe(pn->rx_fc)) { err = sock_queue_rcv_skb(sk, skb); if (!err) return NET_RX_SUCCESS; err = -ENOBUFS; break; } if (pn->rx_credits == 0) { atomic_inc(&sk->sk_drops); err = -ENOBUFS; break; } pn->rx_credits--; queue = &sk->sk_receive_queue; goto queue; case PNS_PEP_STATUS_IND: pipe_rcv_status(sk, skb); break; case PNS_PIPE_REDIRECTED_IND: err = pipe_rcv_created(sk, skb); break; case PNS_PIPE_CREATED_IND: err = pipe_rcv_created(sk, skb); if (err) break; /* fall through */ case PNS_PIPE_RESET_IND: if (!pn->init_enable) break; /* fall through */ case PNS_PIPE_ENABLED_IND: if (!pn_flow_safe(pn->tx_fc)) { atomic_set(&pn->tx_credits, 1); sk->sk_write_space(sk); } if (sk->sk_state == TCP_ESTABLISHED) break; /* Nothing to do */ sk->sk_state = TCP_ESTABLISHED; pipe_grant_credits(sk, GFP_ATOMIC); break; case PNS_PIPE_DISABLED_IND: sk->sk_state = TCP_SYN_RECV; pn->rx_credits = 0; break; default: net_dbg_ratelimited("Phonet unknown PEP message: %u\n", hdr->message_id); err = -EINVAL; } out: kfree_skb(skb); return (err == -ENOBUFS) ? NET_RX_DROP : NET_RX_SUCCESS; queue: skb->dev = NULL; skb_set_owner_r(skb, sk); skb_queue_tail(queue, skb); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return NET_RX_SUCCESS; } /* Destroy connected sock. 
*/ static void pipe_destruct(struct sock *sk) { struct pep_sock *pn = pep_sk(sk); skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&pn->ctrlreq_queue); } static u8 pipe_negotiate_fc(const u8 *fcs, unsigned int n) { unsigned int i; u8 final_fc = PN_NO_FLOW_CONTROL; for (i = 0; i < n; i++) { u8 fc = fcs[i]; if (fc > final_fc && fc < PN_MAX_FLOW_CONTROL) final_fc = fc; } return final_fc; } static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *hdr; u8 n_sb; if (!pskb_pull(skb, sizeof(*hdr) + 4)) return -EINVAL; hdr = pnp_hdr(skb); if (hdr->error_code != PN_PIPE_NO_ERROR) return -ECONNREFUSED; /* Parse sub-blocks */ n_sb = hdr->data[3]; while (n_sb > 0) { u8 type, buf[6], len = sizeof(buf); const u8 *data = pep_get_sb(skb, &type, &len, buf); if (data == NULL) return -EINVAL; switch (type) { case PN_PIPE_SB_REQUIRED_FC_TX: if (len < 2 || len < data[0]) break; pn->tx_fc = pipe_negotiate_fc(data + 2, len - 2); break; case PN_PIPE_SB_PREFERRED_FC_RX: if (len < 2 || len < data[0]) break; pn->rx_fc = pipe_negotiate_fc(data + 2, len - 2); break; } n_sb--; } return pipe_handler_send_created_ind(sk); } static int pep_enableresp_rcv(struct sock *sk, struct sk_buff *skb) { struct pnpipehdr *hdr = pnp_hdr(skb); if (hdr->error_code != PN_PIPE_NO_ERROR) return -ECONNREFUSED; return pep_indicate(sk, PNS_PIPE_ENABLED_IND, 0 /* sub-blocks */, NULL, 0, GFP_ATOMIC); } static void pipe_start_flow_control(struct sock *sk) { struct pep_sock *pn = pep_sk(sk); if (!pn_flow_safe(pn->tx_fc)) { atomic_set(&pn->tx_credits, 1); sk->sk_write_space(sk); } pipe_grant_credits(sk, GFP_ATOMIC); } /* Queue an skb to an actively connected sock. * Socket lock must be held. */ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *hdr = pnp_hdr(skb); int err = NET_RX_SUCCESS; switch (hdr->message_id) { case PNS_PIPE_ALIGNED_DATA: __skb_pull(skb, 1); /* fall through */ case PNS_PIPE_DATA: __skb_pull(skb, 3); /* Pipe data header */ if (!pn_flow_safe(pn->rx_fc)) { err = sock_queue_rcv_skb(sk, skb); if (!err) return NET_RX_SUCCESS; err = NET_RX_DROP; break; } if (pn->rx_credits == 0) { atomic_inc(&sk->sk_drops); err = NET_RX_DROP; break; } pn->rx_credits--; skb->dev = NULL; skb_set_owner_r(skb, sk); skb_queue_tail(&sk->sk_receive_queue, skb); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return NET_RX_SUCCESS; case PNS_PEP_CONNECT_RESP: if (sk->sk_state != TCP_SYN_SENT) break; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); if (pep_connresp_rcv(sk, skb)) { sk->sk_state = TCP_CLOSE_WAIT; break; } if (pn->init_enable == PN_PIPE_DISABLE) sk->sk_state = TCP_SYN_RECV; else { sk->sk_state = TCP_ESTABLISHED; pipe_start_flow_control(sk); } break; case PNS_PEP_ENABLE_RESP: if (sk->sk_state != TCP_SYN_SENT) break; if (pep_enableresp_rcv(sk, skb)) { sk->sk_state = TCP_CLOSE_WAIT; break; } sk->sk_state = TCP_ESTABLISHED; pipe_start_flow_control(sk); break; case PNS_PEP_DISCONNECT_RESP: /* sock should already be dead, nothing to do */ break; case PNS_PEP_STATUS_IND: pipe_rcv_status(sk, skb); break; } kfree_skb(skb); return err; } /* Listening sock must be locked */ static struct sock *pep_find_pipe(const struct hlist_head *hlist, const struct sockaddr_pn *dst, u8 pipe_handle) { struct sock *sknode; u16 dobj = pn_sockaddr_get_object(dst); sk_for_each(sknode, hlist) { struct pep_sock *pnnode = pep_sk(sknode); /* Ports match, but addresses might not: */ if (pnnode->pn_sk.sobject != dobj) continue; 
if (pnnode->pipe_handle != pipe_handle) continue; if (sknode->sk_state == TCP_CLOSE_WAIT) continue; sock_hold(sknode); return sknode; } return NULL; } /* * Deliver an skb to a listening sock. * Socket lock must be held. * We then queue the skb to the right connected sock (if any). */ static int pep_do_rcv(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct sock *sknode; struct pnpipehdr *hdr; struct sockaddr_pn dst; u8 pipe_handle; if (!pskb_may_pull(skb, sizeof(*hdr))) goto drop; hdr = pnp_hdr(skb); pipe_handle = hdr->pipe_handle; if (pipe_handle == PN_PIPE_INVALID_HANDLE) goto drop; pn_skb_get_dst_sockaddr(skb, &dst); /* Look for an existing pipe handle */ sknode = pep_find_pipe(&pn->hlist, &dst, pipe_handle); if (sknode) return sk_receive_skb(sknode, skb, 1); switch (hdr->message_id) { case PNS_PEP_CONNECT_REQ: if (sk->sk_state != TCP_LISTEN || sk_acceptq_is_full(sk)) { pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE, GFP_ATOMIC); break; } skb_queue_head(&sk->sk_receive_queue, skb); sk_acceptq_added(sk); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return NET_RX_SUCCESS; case PNS_PEP_DISCONNECT_REQ: pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); break; case PNS_PEP_CTRL_REQ: pep_ctrlreq_error(sk, skb, PN_PIPE_INVALID_HANDLE, GFP_ATOMIC); break; case PNS_PEP_RESET_REQ: case PNS_PEP_ENABLE_REQ: case PNS_PEP_DISABLE_REQ: /* invalid handle is not even allowed here! */ break; default: if ((1 << sk->sk_state) & ~(TCPF_CLOSE|TCPF_LISTEN|TCPF_CLOSE_WAIT)) /* actively connected socket */ return pipe_handler_do_rcv(sk, skb); } drop: kfree_skb(skb); return NET_RX_SUCCESS; } static int pipe_do_remove(struct sock *sk) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *ph; struct sk_buff *skb; skb = pep_alloc_skb(sk, NULL, 0, GFP_KERNEL); if (!skb) return -ENOMEM; ph = pnp_hdr(skb); ph->utid = 0; ph->message_id = PNS_PIPE_REMOVE_REQ; ph->pipe_handle = pn->pipe_handle; ph->data0 = PAD; return pn_skb_send(sk, skb, NULL); } /* associated socket ceases to exist */ static void pep_sock_close(struct sock *sk, long timeout) { struct pep_sock *pn = pep_sk(sk); int ifindex = 0; sock_hold(sk); /* keep a reference after sk_common_release() */ sk_common_release(sk); lock_sock(sk); if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED)) { if (sk->sk_backlog_rcv == pipe_do_rcv) /* Forcefully remove dangling Phonet pipe */ pipe_do_remove(sk); else pipe_handler_request(sk, PNS_PEP_DISCONNECT_REQ, PAD, NULL, 0); } sk->sk_state = TCP_CLOSE; ifindex = pn->ifindex; pn->ifindex = 0; release_sock(sk); if (ifindex) gprs_detach(sk); sock_put(sk); } static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp, bool kern) { struct pep_sock *pn = pep_sk(sk), *newpn; struct sock *newsk = NULL; struct sk_buff *skb; struct pnpipehdr *hdr; struct sockaddr_pn dst, src; int err; u16 peer_type; u8 pipe_handle, enabled, n_sb; u8 aligned = 0; skb = skb_recv_datagram(sk, 0, flags & O_NONBLOCK, errp); if (!skb) return NULL; lock_sock(sk); if (sk->sk_state != TCP_LISTEN) { err = -EINVAL; goto drop; } sk_acceptq_removed(sk); err = -EPROTO; if (!pskb_may_pull(skb, sizeof(*hdr) + 4)) goto drop; hdr = pnp_hdr(skb); pipe_handle = hdr->pipe_handle; switch (hdr->state_after_connect) { case PN_PIPE_DISABLE: enabled = 0; break; case PN_PIPE_ENABLE: enabled = 1; break; default: pep_reject_conn(sk, skb, PN_PIPE_ERR_INVALID_PARAM, GFP_KERNEL); goto drop; } peer_type = hdr->other_pep_type << 8; /* Parse sub-blocks (options) */ n_sb = hdr->data[3]; while (n_sb > 0) { u8 type, buf[1], 
len = sizeof(buf); const u8 *data = pep_get_sb(skb, &type, &len, buf); if (data == NULL) goto drop; switch (type) { case PN_PIPE_SB_CONNECT_REQ_PEP_SUB_TYPE: if (len < 1) goto drop; peer_type = (peer_type & 0xff00) | data[0]; break; case PN_PIPE_SB_ALIGNED_DATA: aligned = data[0] != 0; break; } n_sb--; } /* Check for duplicate pipe handle */ newsk = pep_find_pipe(&pn->hlist, &dst, pipe_handle); if (unlikely(newsk)) { __sock_put(newsk); newsk = NULL; pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE, GFP_KERNEL); goto drop; } /* Create a new to-be-accepted sock */ newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot, kern); if (!newsk) { pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL); err = -ENOBUFS; goto drop; } sock_init_data(NULL, newsk); newsk->sk_state = TCP_SYN_RECV; newsk->sk_backlog_rcv = pipe_do_rcv; newsk->sk_protocol = sk->sk_protocol; newsk->sk_destruct = pipe_destruct; newpn = pep_sk(newsk); pn_skb_get_dst_sockaddr(skb, &dst); pn_skb_get_src_sockaddr(skb, &src); newpn->pn_sk.sobject = pn_sockaddr_get_object(&dst); newpn->pn_sk.dobject = pn_sockaddr_get_object(&src); newpn->pn_sk.resource = pn_sockaddr_get_resource(&dst); sock_hold(sk); newpn->listener = sk; skb_queue_head_init(&newpn->ctrlreq_queue); newpn->pipe_handle = pipe_handle; atomic_set(&newpn->tx_credits, 0); newpn->ifindex = 0; newpn->peer_type = peer_type; newpn->rx_credits = 0; newpn->rx_fc = newpn->tx_fc = PN_LEGACY_FLOW_CONTROL; newpn->init_enable = enabled; newpn->aligned = aligned; err = pep_accept_conn(newsk, skb); if (err) { __sock_put(sk); sock_put(newsk); newsk = NULL; goto drop; } sk_add_node(newsk, &pn->hlist); drop: release_sock(sk); kfree_skb(skb); *errp = err; return newsk; } static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len) { struct pep_sock *pn = pep_sk(sk); int err; u8 data[4] = { 0 /* sub-blocks */, PAD, PAD, PAD }; if (pn->pipe_handle == PN_PIPE_INVALID_HANDLE) pn->pipe_handle = 1; /* anything but INVALID_HANDLE */ err = pipe_handler_request(sk, PNS_PEP_CONNECT_REQ, pn->init_enable, data, 4); if (err) { pn->pipe_handle = PN_PIPE_INVALID_HANDLE; return err; } sk->sk_state = TCP_SYN_SENT; return 0; } static int pep_sock_enable(struct sock *sk, struct sockaddr *addr, int len) { int err; err = pipe_handler_request(sk, PNS_PEP_ENABLE_REQ, PAD, NULL, 0); if (err) return err; sk->sk_state = TCP_SYN_SENT; return 0; } static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg) { struct pep_sock *pn = pep_sk(sk); int answ; int ret = -ENOIOCTLCMD; switch (cmd) { case SIOCINQ: if (sk->sk_state == TCP_LISTEN) { ret = -EINVAL; break; } lock_sock(sk); if (sock_flag(sk, SOCK_URGINLINE) && !skb_queue_empty(&pn->ctrlreq_queue)) answ = skb_peek(&pn->ctrlreq_queue)->len; else if (!skb_queue_empty(&sk->sk_receive_queue)) answ = skb_peek(&sk->sk_receive_queue)->len; else answ = 0; release_sock(sk); ret = put_user(answ, (int __user *)arg); break; case SIOCPNENABLEPIPE: lock_sock(sk); if (sk->sk_state == TCP_SYN_SENT) ret = -EBUSY; else if (sk->sk_state == TCP_ESTABLISHED) ret = -EISCONN; else if (!pn->pn_sk.sobject) ret = -EADDRNOTAVAIL; else ret = pep_sock_enable(sk, NULL, 0); release_sock(sk); break; } return ret; } static int pep_init(struct sock *sk) { struct pep_sock *pn = pep_sk(sk); sk->sk_destruct = pipe_destruct; INIT_HLIST_HEAD(&pn->hlist); pn->listener = NULL; skb_queue_head_init(&pn->ctrlreq_queue); atomic_set(&pn->tx_credits, 0); pn->ifindex = 0; pn->peer_type = 0; pn->pipe_handle = PN_PIPE_INVALID_HANDLE; pn->rx_credits = 0; pn->rx_fc = pn->tx_fc = 
PN_LEGACY_FLOW_CONTROL; pn->init_enable = 1; pn->aligned = 0; return 0; } static int pep_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { struct pep_sock *pn = pep_sk(sk); int val = 0, err = 0; if (level != SOL_PNPIPE) return -ENOPROTOOPT; if (optlen >= sizeof(int)) { if (get_user(val, (int __user *) optval)) return -EFAULT; } lock_sock(sk); switch (optname) { case PNPIPE_ENCAP: if (val && val != PNPIPE_ENCAP_IP) { err = -EINVAL; break; } if (!pn->ifindex == !val) break; /* Nothing to do! */ if (!capable(CAP_NET_ADMIN)) { err = -EPERM; break; } if (val) { release_sock(sk); err = gprs_attach(sk); if (err > 0) { pn->ifindex = err; err = 0; } } else { pn->ifindex = 0; release_sock(sk); gprs_detach(sk); err = 0; } goto out_norel; case PNPIPE_HANDLE: if ((sk->sk_state == TCP_CLOSE) && (val >= 0) && (val < PN_PIPE_INVALID_HANDLE)) pn->pipe_handle = val; else err = -EINVAL; break; case PNPIPE_INITSTATE: pn->init_enable = !!val; break; default: err = -ENOPROTOOPT; } release_sock(sk); out_norel: return err; } static int pep_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct pep_sock *pn = pep_sk(sk); int len, val; if (level != SOL_PNPIPE) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; switch (optname) { case PNPIPE_ENCAP: val = pn->ifindex ? PNPIPE_ENCAP_IP : PNPIPE_ENCAP_NONE; break; case PNPIPE_IFINDEX: val = pn->ifindex; break; case PNPIPE_HANDLE: val = pn->pipe_handle; if (val == PN_PIPE_INVALID_HANDLE) return -EINVAL; break; case PNPIPE_INITSTATE: val = pn->init_enable; break; default: return -ENOPROTOOPT; } len = min_t(unsigned int, sizeof(int), len); if (put_user(len, optlen)) return -EFAULT; if (put_user(val, (int __user *) optval)) return -EFAULT; return 0; } static int pipe_skb_send(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *ph; int err; if (pn_flow_safe(pn->tx_fc) && !atomic_add_unless(&pn->tx_credits, -1, 0)) { kfree_skb(skb); return -ENOBUFS; } skb_push(skb, 3 + pn->aligned); skb_reset_transport_header(skb); ph = pnp_hdr(skb); ph->utid = 0; if (pn->aligned) { ph->message_id = PNS_PIPE_ALIGNED_DATA; ph->data0 = 0; /* padding */ } else ph->message_id = PNS_PIPE_DATA; ph->pipe_handle = pn->pipe_handle; err = pn_skb_send(sk, skb, NULL); if (err && pn_flow_safe(pn->tx_fc)) atomic_inc(&pn->tx_credits); return err; } static int pep_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct pep_sock *pn = pep_sk(sk); struct sk_buff *skb; long timeo; int flags = msg->msg_flags; int err, done; if (len > USHRT_MAX) return -EMSGSIZE; if ((msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL| MSG_CMSG_COMPAT)) || !(msg->msg_flags & MSG_EOR)) return -EOPNOTSUPP; skb = sock_alloc_send_skb(sk, MAX_PNPIPE_HEADER + len, flags & MSG_DONTWAIT, &err); if (!skb) return err; skb_reserve(skb, MAX_PHONET_HEADER + 3 + pn->aligned); err = memcpy_from_msg(skb_put(skb, len), msg, len); if (err < 0) goto outfree; lock_sock(sk); timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); if ((1 << sk->sk_state) & (TCPF_LISTEN|TCPF_CLOSE)) { err = -ENOTCONN; goto out; } if (sk->sk_state != TCP_ESTABLISHED) { /* Wait until the pipe gets to enabled state */ disabled: err = sk_stream_wait_connect(sk, &timeo); if (err) goto out; if (sk->sk_state == TCP_CLOSE_WAIT) { err = -ECONNRESET; goto out; } } BUG_ON(sk->sk_state != TCP_ESTABLISHED); /* Wait until flow control allows TX */ done = atomic_read(&pn->tx_credits); while (!done) { DEFINE_WAIT_FUNC(wait, 
woken_wake_function); if (!timeo) { err = -EAGAIN; goto out; } if (signal_pending(current)) { err = sock_intr_errno(timeo); goto out; } add_wait_queue(sk_sleep(sk), &wait); done = sk_wait_event(sk, &timeo, atomic_read(&pn->tx_credits), &wait); remove_wait_queue(sk_sleep(sk), &wait); if (sk->sk_state != TCP_ESTABLISHED) goto disabled; } err = pipe_skb_send(sk, skb); if (err >= 0) err = len; /* success! */ skb = NULL; out: release_sock(sk); outfree: kfree_skb(skb); return err; } int pep_writeable(struct sock *sk) { struct pep_sock *pn = pep_sk(sk); return atomic_read(&pn->tx_credits); } int pep_write(struct sock *sk, struct sk_buff *skb) { struct sk_buff *rskb, *fs; int flen = 0; if (pep_sk(sk)->aligned) return pipe_skb_send(sk, skb); rskb = alloc_skb(MAX_PNPIPE_HEADER, GFP_ATOMIC); if (!rskb) { kfree_skb(skb); return -ENOMEM; } skb_shinfo(rskb)->frag_list = skb; rskb->len += skb->len; rskb->data_len += rskb->len; rskb->truesize += rskb->len; /* Avoid nested fragments */ skb_walk_frags(skb, fs) flen += fs->len; skb->next = skb_shinfo(skb)->frag_list; skb_frag_list_init(skb); skb->len -= flen; skb->data_len -= flen; skb->truesize -= flen; skb_reserve(rskb, MAX_PHONET_HEADER + 3); return pipe_skb_send(sk, rskb); } struct sk_buff *pep_read(struct sock *sk) { struct sk_buff *skb = skb_dequeue(&sk->sk_receive_queue); if (sk->sk_state == TCP_ESTABLISHED) pipe_grant_credits(sk, GFP_ATOMIC); return skb; } static int pep_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len) { struct sk_buff *skb; int err; if (flags & ~(MSG_OOB|MSG_PEEK|MSG_TRUNC|MSG_DONTWAIT|MSG_WAITALL| MSG_NOSIGNAL|MSG_CMSG_COMPAT)) return -EOPNOTSUPP; if (unlikely(1 << sk->sk_state & (TCPF_LISTEN | TCPF_CLOSE))) return -ENOTCONN; if ((flags & MSG_OOB) || sock_flag(sk, SOCK_URGINLINE)) { /* Dequeue and acknowledge control request */ struct pep_sock *pn = pep_sk(sk); if (flags & MSG_PEEK) return -EOPNOTSUPP; skb = skb_dequeue(&pn->ctrlreq_queue); if (skb) { pep_ctrlreq_error(sk, skb, PN_PIPE_NO_ERROR, GFP_KERNEL); msg->msg_flags |= MSG_OOB; goto copy; } if (flags & MSG_OOB) return -EINVAL; } skb = skb_recv_datagram(sk, flags, noblock, &err); lock_sock(sk); if (skb == NULL) { if (err == -ENOTCONN && sk->sk_state == TCP_CLOSE_WAIT) err = -ECONNRESET; release_sock(sk); return err; } if (sk->sk_state == TCP_ESTABLISHED) pipe_grant_credits(sk, GFP_KERNEL); release_sock(sk); copy: msg->msg_flags |= MSG_EOR; if (skb->len > len) msg->msg_flags |= MSG_TRUNC; else len = skb->len; err = skb_copy_datagram_msg(skb, 0, msg, len); if (!err) err = (flags & MSG_TRUNC) ? skb->len : len; skb_free_datagram(sk, skb); return err; } static void pep_sock_unhash(struct sock *sk) { struct pep_sock *pn = pep_sk(sk); struct sock *skparent = NULL; lock_sock(sk); if (pn->listener != NULL) { skparent = pn->listener; pn->listener = NULL; release_sock(sk); pn = pep_sk(skparent); lock_sock(skparent); sk_del_node_init(sk); sk = skparent; } /* Unhash a listening sock only when it is closed * and all of its active connected pipes are closed. 
*/ if (hlist_empty(&pn->hlist)) pn_sock_unhash(&pn->pn_sk.sk); release_sock(sk); if (skparent) sock_put(skparent); } static struct proto pep_proto = { .close = pep_sock_close, .accept = pep_sock_accept, .connect = pep_sock_connect, .ioctl = pep_ioctl, .init = pep_init, .setsockopt = pep_setsockopt, .getsockopt = pep_getsockopt, .sendmsg = pep_sendmsg, .recvmsg = pep_recvmsg, .backlog_rcv = pep_do_rcv, .hash = pn_sock_hash, .unhash = pep_sock_unhash, .get_port = pn_sock_get_port, .obj_size = sizeof(struct pep_sock), .owner = THIS_MODULE, .name = "PNPIPE", }; static const struct phonet_protocol pep_pn_proto = { .ops = &phonet_stream_ops, .prot = &pep_proto, .sock_type = SOCK_SEQPACKET, }; static int __init pep_register(void) { return phonet_proto_register(PN_PROTO_PIPE, &pep_pn_proto); } static void __exit pep_unregister(void) { phonet_proto_unregister(PN_PROTO_PIPE, &pep_pn_proto); } module_init(pep_register); module_exit(pep_unregister); MODULE_AUTHOR("Remi Denis-Courmont, Nokia"); MODULE_DESCRIPTION("Phonet pipe protocol"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO(PF_PHONET, PN_PROTO_PIPE);
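/*
 * Illustrative sketch only (not part of the original file): a minimal
 * userspace view of the PNPIPE socket options handled by pep_setsockopt()
 * and pep_getsockopt() above. It assumes the Phonet UAPI headers export
 * SOL_PNPIPE and the PNPIPE_* option names used in the code; error
 * handling and the GPRS encapsulation path are omitted.
 */
#if 0	/* userspace example, never compiled with the kernel */
#include <sys/socket.h>
#include <linux/phonet.h>

static int pnpipe_query_handle(int fd)
{
	int handle;
	socklen_t len = sizeof(handle);

	/* pep_getsockopt() returns -EINVAL while no pipe handle is set. */
	if (getsockopt(fd, SOL_PNPIPE, PNPIPE_HANDLE, &handle, &len) < 0)
		return -1;
	return handle;
}
#endif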
/* * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project. * * Copyright (c) 2001-2005 Anton Altaparmakov * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published * by the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program/include file is distributed in the hope that it will be * useful, but WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program (in the main directory of the Linux-NTFS * distribution in the file COPYING); if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _LINUX_NTFS_MALLOC_H #define _LINUX_NTFS_MALLOC_H #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/highmem.h> /** * __ntfs_malloc - allocate memory in multiples of pages * @size: number of bytes to allocate * @gfp_mask: extra flags for the allocator * * Internal function. You probably want ntfs_malloc_nofs()... * * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and * returns a pointer to the allocated memory. * * If there was insufficient memory to complete the request, return NULL. * Depending on @gfp_mask the allocation may be guaranteed to succeed. */ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) { if (likely(size <= PAGE_SIZE)) { BUG_ON(!size); /* kmalloc() has per-CPU caches so is faster for now. */ return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); /* return (void *)__get_free_page(gfp_mask); */ } if (likely((size >> PAGE_SHIFT) < totalram_pages)) return __vmalloc(size, gfp_mask, PAGE_KERNEL); return NULL; } /** * ntfs_malloc_nofs - allocate memory in multiples of pages * @size: number of bytes to allocate * * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and * returns a pointer to the allocated memory. * * If there was insufficient memory to complete the request, return NULL. */ static inline void *ntfs_malloc_nofs(unsigned long size) { return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM); } /** * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages * @size: number of bytes to allocate * * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and * returns a pointer to the allocated memory. * * This function guarantees that the allocation will succeed. It will sleep * for as long as it takes to complete the allocation. * * If there was insufficient memory to complete the request, return NULL. */ static inline void *ntfs_malloc_nofs_nofail(unsigned long size) { return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL); } static inline void ntfs_free(void *addr) { kvfree(addr); } #endif /* _LINUX_NTFS_MALLOC_H */
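/*
 * Illustrative sketch only (not part of the original header): how a caller
 * in the driver would typically pair the helpers above. The helper name
 * ntfs_dup_buffer() is hypothetical; memcpy() comes from <linux/string.h>.
 */
#if 0	/* example usage, never compiled */
static u8 *ntfs_dup_buffer(const u8 *src, unsigned long size)
{
	u8 *dst = ntfs_malloc_nofs(size);	/* page-granular, GFP_NOFS */

	if (unlikely(!dst))
		return NULL;
	memcpy(dst, src, size);
	/* Release with ntfs_free(): kvfree() copes with both the kmalloc()
	 * and the __vmalloc() case, so callers need not track which
	 * allocator backed the buffer.
	 */
	return dst;
}
#endif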
/* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * (C) 2002-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * (C) 2006-2012 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #include <linux/types.h> #include <linux/timer.h> #include <linux/module.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/spinlock.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <net/ip6_checksum.h> #include <asm/unaligned.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> /* "Be conservative in what you do, be liberal in what you accept from others." If it's non-zero, we mark only out of window RST segments as INVALID. */ static int nf_ct_tcp_be_liberal __read_mostly = 0; /* If it is set to zero, we disable picking up already established connections. */ static int nf_ct_tcp_loose __read_mostly = 1; /* Max number of the retransmitted packets without receiving an (acceptable) ACK from the destination. If this number is reached, a shorter timer will be started. */ static int nf_ct_tcp_max_retrans __read_mostly = 3; /* FIXME: Examine ipfilter's timeouts and conntrack transitions more closely. They're more complex. --RR */ static const char *const tcp_conntrack_names[] = { "NONE", "SYN_SENT", "SYN_RECV", "ESTABLISHED", "FIN_WAIT", "CLOSE_WAIT", "LAST_ACK", "TIME_WAIT", "CLOSE", "SYN_SENT2", }; #define SECS * HZ #define MINS * 60 SECS #define HOURS * 60 MINS #define DAYS * 24 HOURS static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = { [TCP_CONNTRACK_SYN_SENT] = 2 MINS, [TCP_CONNTRACK_SYN_RECV] = 60 SECS, [TCP_CONNTRACK_ESTABLISHED] = 5 DAYS, [TCP_CONNTRACK_FIN_WAIT] = 2 MINS, [TCP_CONNTRACK_CLOSE_WAIT] = 60 SECS, [TCP_CONNTRACK_LAST_ACK] = 30 SECS, [TCP_CONNTRACK_TIME_WAIT] = 2 MINS, [TCP_CONNTRACK_CLOSE] = 10 SECS, [TCP_CONNTRACK_SYN_SENT2] = 2 MINS, /* RFC1122 says the R2 limit should be at least 100 seconds. Linux uses 15 packets as limit, which corresponds to ~13-30min depending on RTO.
*/ [TCP_CONNTRACK_RETRANS] = 5 MINS, [TCP_CONNTRACK_UNACK] = 5 MINS, }; #define sNO TCP_CONNTRACK_NONE #define sSS TCP_CONNTRACK_SYN_SENT #define sSR TCP_CONNTRACK_SYN_RECV #define sES TCP_CONNTRACK_ESTABLISHED #define sFW TCP_CONNTRACK_FIN_WAIT #define sCW TCP_CONNTRACK_CLOSE_WAIT #define sLA TCP_CONNTRACK_LAST_ACK #define sTW TCP_CONNTRACK_TIME_WAIT #define sCL TCP_CONNTRACK_CLOSE #define sS2 TCP_CONNTRACK_SYN_SENT2 #define sIV TCP_CONNTRACK_MAX #define sIG TCP_CONNTRACK_IGNORE /* What TCP flags are set from RST/SYN/FIN/ACK. */ enum tcp_bit_set { TCP_SYN_SET, TCP_SYNACK_SET, TCP_FIN_SET, TCP_ACK_SET, TCP_RST_SET, TCP_NONE_SET, }; /* * The TCP state transition table needs a few words... * * We are the man in the middle. All the packets go through us * but might get lost in transit to the destination. * It is assumed that the destinations can't receive segments * we haven't seen. * * The checked segment is in window, but our windows are *not* * equivalent with the ones of the sender/receiver. We always * try to guess the state of the current sender. * * The meaning of the states are: * * NONE: initial state * SYN_SENT: SYN-only packet seen * SYN_SENT2: SYN-only packet seen from reply dir, simultaneous open * SYN_RECV: SYN-ACK packet seen * ESTABLISHED: ACK packet seen * FIN_WAIT: FIN packet seen * CLOSE_WAIT: ACK seen (after FIN) * LAST_ACK: FIN seen (after FIN) * TIME_WAIT: last ACK seen * CLOSE: closed connection (RST) * * Packets marked as IGNORED (sIG): * if they may be either invalid or valid * and the receiver may send back a connection * closing RST or a SYN/ACK. * * Packets marked as INVALID (sIV): * if we regard them as truly invalid packets */ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { { /* ORIGINAL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 }, /* * sNO -> sSS Initialize a new connection * sSS -> sSS Retransmitted SYN * sS2 -> sS2 Late retransmitted SYN * sSR -> sIG * sES -> sIG Error: SYNs in window outside the SYN_SENT state * are errors. Receiver will reply with RST * and close the connection. * Or we are not in sync and hold a dead connection. * sFW -> sIG * sCW -> sIG * sLA -> sIG * sTW -> sSS Reopened connection (RFC 1122). * sCL -> sSS */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR }, /* * sNO -> sIV Too late and no reason to do anything * sSS -> sIV Client can't send SYN and then SYN/ACK * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open * sSR -> sSR Late retransmitted SYN/ACK in simultaneous open * sES -> sIV Invalid SYN/ACK packets sent by the client * sFW -> sIV * sCW -> sIV * sLA -> sIV * sTW -> sIV * sCL -> sIV */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sNO -> sIV Too late and no reason to do anything... * sSS -> sIV Client migth not send FIN in this state: * we enforce waiting for a SYN/ACK reply first. * sS2 -> sIV * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions, waiting for * the last ACK. * Migth be a retransmitted FIN as well... * sCW -> sLA * sLA -> sLA Retransmitted FIN. Remain in the same state. * sTW -> sTW * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, /* * sNO -> sES Assumed. * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. * sS2 -> sIV * sSR -> sES Established state is reached. 
* sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW * sLA -> sTW Last ACK detected (RFC5961 challenged) * sTW -> sTW Retransmitted last ACK. Remain in the same state. * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } }, { /* REPLY */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*syn*/ { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sSS, sIV, sS2 }, /* * sNO -> sIV Never reached. * sSS -> sS2 Simultaneous open * sS2 -> sS2 Retransmitted simultaneous SYN * sSR -> sIV Invalid SYN packets sent by the server * sES -> sIV * sFW -> sIV * sCW -> sIV * sLA -> sIV * sTW -> sSS Reopened connection, but server may have switched role * sCL -> sIV */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, /* * sSS -> sSR Standard open. * sS2 -> sSR Simultaneous open * sSR -> sIG Retransmitted SYN/ACK, ignore it. * sES -> sIG Late retransmitted SYN/ACK? * sFW -> sIG Might be SYN/ACK answering ignored SYN * sCW -> sIG * sLA -> sIG * sTW -> sIG * sCL -> sIG */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sSS -> sIV Server might not send FIN in this state. * sS2 -> sIV * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions. * sCW -> sLA * sLA -> sLA Retransmitted FIN. * sTW -> sTW * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG }, /* * sSS -> sIG Might be a half-open connection. * sS2 -> sIG * sSR -> sSR Might answer late resent SYN. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. * sCW -> sCW * sLA -> sTW Last ACK detected (RFC5961 challenged) * sTW -> sTW Retransmitted last ACK. * sCL -> sCL */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } } }; static inline struct nf_tcp_net *tcp_pernet(struct net *net) { return &net->ct.nf_ct_proto.tcp; } #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) return; seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]); } #endif static unsigned int get_conntrack_index(const struct tcphdr *tcph) { if (tcph->rst) return TCP_RST_SET; else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); else if (tcph->fin) return TCP_FIN_SET; else if (tcph->ack) return TCP_ACK_SET; else return TCP_NONE_SET; } /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering in IP Filter' by Guido van Rooij. http://www.sane.nl/events/sane2000/papers.html http://www.darkart.com/mirrors/www.obfuscation.org/ipf/ The boundaries and the conditions are changed according to RFC793: the packet must intersect the window (i.e. segments may be after the right or before the left edge) and thus receivers may ACK segments after the right edge of the window. td_maxend = max(sack + max(win,1)) seen in reply packets td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets td_maxwin += seq + len - sender.td_maxend if seq + len > sender.td_maxend td_end = max(seq + len) seen in sent packets I. Upper bound for valid data: seq <= sender.td_maxend II. 
Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin III. Upper bound for valid (s)ack: sack <= receiver.td_end IV. Lower bound for valid (s)ack: sack >= receiver.td_end - MAXACKWINDOW where sack is the highest right edge of sack block found in the packet or ack in the case of packet without SACK option. The upper bound limit for a valid (s)ack is not ignored - we doesn't have to deal with fragments. */ static inline __u32 segment_seq_plus_len(__u32 seq, size_t len, unsigned int dataoff, const struct tcphdr *tcph) { /* XXX Should I use payload length field in IP/IPv6 header ? * - YK */ return (seq + len - dataoff - tcph->doff*4 + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0)); } /* Fixme: what about big packets? */ #define MAXACKWINCONST 66000 #define MAXACKWINDOW(sender) \ ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \ : MAXACKWINCONST) /* * Simplified tcp_parse_options routine from tcp_input.c */ static void tcp_options(const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph, struct ip_ct_tcp_state *state) { unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; const unsigned char *ptr; int length = (tcph->doff*4) - sizeof(struct tcphdr); if (!length) return; ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), length, buff); BUG_ON(ptr == NULL); state->td_scale = 0; state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL; while (length > 0) { int opcode=*ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: if (length < 2) return; opsize=*ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) return; /* don't parse partial options */ if (opcode == TCPOPT_SACK_PERM && opsize == TCPOLEN_SACK_PERM) state->flags |= IP_CT_TCP_FLAG_SACK_PERM; else if (opcode == TCPOPT_WINDOW && opsize == TCPOLEN_WINDOW) { state->td_scale = *(u_int8_t *)ptr; if (state->td_scale > TCP_MAX_WSCALE) state->td_scale = TCP_MAX_WSCALE; state->flags |= IP_CT_TCP_FLAG_WINDOW_SCALE; } ptr += opsize - 2; length -= opsize; } } } static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph, __u32 *sack) { unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; const unsigned char *ptr; int length = (tcph->doff*4) - sizeof(struct tcphdr); __u32 tmp; if (!length) return; ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), length, buff); BUG_ON(ptr == NULL); /* Fast path for timestamp-only option */ if (length == TCPOLEN_TSTAMP_ALIGNED && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) return; while (length > 0) { int opcode = *ptr++; int opsize, i; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: if (length < 2) return; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) return; /* don't parse partial options */ if (opcode == TCPOPT_SACK && opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK) && !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK)) { for (i = 0; i < (opsize - TCPOLEN_SACK_BASE); i += TCPOLEN_SACK_PERBLOCK) { tmp = get_unaligned_be32((__be32 *)(ptr+i)+1); if (after(tmp, *sack)) *sack = tmp; } return; } ptr += opsize - 2; length -= opsize; } } } static bool tcp_in_window(const struct nf_conn *ct, struct ip_ct_tcp *state, enum ip_conntrack_dir dir, unsigned int index, const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph) { 
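	/* tcp_in_window() applies the window-tracking rules I-IV documented
	 * above: it (re)initializes the per-direction td_end/td_maxend/td_maxwin
	 * bookkeeping when a direction is seen for the first time, then accepts
	 * the segment only if its sequence and (s)ack numbers fall within the
	 * tracked windows, unless this connection (or the pernet
	 * nf_conntrack_tcp_be_liberal setting) is in liberal mode.
	 */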
struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = tcp_pernet(net); struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; __u32 seq, ack, sack, end, win, swin; u16 win_raw; s32 receiver_offset; bool res, in_recv_win; /* * Get the required data from the packet. */ seq = ntohl(tcph->seq); ack = sack = ntohl(tcph->ack_seq); win_raw = ntohs(tcph->window); win = win_raw; end = segment_seq_plus_len(seq, skb->len, dataoff, tcph); if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) tcp_sack(skb, dataoff, tcph, &sack); /* Take into account NAT sequence number mangling */ receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1); ack -= receiver_offset; sack -= receiver_offset; pr_debug("tcp_in_window: START\n"); pr_debug("tcp_in_window: "); nf_ct_dump_tuple(tuple); pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n", seq, ack, receiver_offset, sack, receiver_offset, win, end); pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, sender->td_scale, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); if (sender->td_maxwin == 0) { /* * Initialize sender data. */ if (tcph->syn) { /* * SYN-ACK in reply to a SYN * or SYN from reply direction in simultaneous open. */ sender->td_end = sender->td_maxend = end; sender->td_maxwin = (win == 0 ? 1 : win); tcp_options(skb, dataoff, tcph, sender); /* * RFC 1323: * Both sides must send the Window Scale option * to enable window scaling in either direction. */ if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) sender->td_scale = receiver->td_scale = 0; if (!tcph->ack) /* Simultaneous open */ return true; } else { /* * We are in the middle of a connection, * its history is lost for us. * Let's try to use the data from the packet. */ sender->td_end = end; swin = win << sender->td_scale; sender->td_maxwin = (swin == 0 ? 1 : swin); sender->td_maxend = end + sender->td_maxwin; if (receiver->td_maxwin == 0) { /* We haven't seen traffic in the other * direction yet but we have to tweak window * tracking to pass III and IV until that * happens. */ receiver->td_end = receiver->td_maxend = sack; } else if (sack == receiver->td_end + 1) { /* Likely a reply to a keepalive. * Needed for III. */ receiver->td_end++; } } } else if (((state->state == TCP_CONNTRACK_SYN_SENT && dir == IP_CT_DIR_ORIGINAL) || (state->state == TCP_CONNTRACK_SYN_RECV && dir == IP_CT_DIR_REPLY)) && after(end, sender->td_end)) { /* * RFC 793: "if a TCP is reinitialized ... then it need * not wait at all; it must only be sure to use sequence * numbers larger than those recently used." */ sender->td_end = sender->td_maxend = end; sender->td_maxwin = (win == 0 ? 1 : win); tcp_options(skb, dataoff, tcph, sender); } if (!(tcph->ack)) { /* * If there is no ACK, just pretend it was set and OK. */ ack = sack = receiver->td_end; } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == (TCP_FLAG_ACK|TCP_FLAG_RST)) && (ack == 0)) { /* * Broken TCP stacks, that set ACK in RST packets as well * with zero ack value. */ ack = sack = receiver->td_end; } if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT) /* * RST sent answering SYN. 
*/ seq = end = sender->td_end; pr_debug("tcp_in_window: "); nf_ct_dump_tuple(tuple); pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n", seq, ack, receiver_offset, sack, receiver_offset, win, end); pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, sender->td_scale, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); /* Is the ending sequence in the receive window (if available)? */ in_recv_win = !receiver->td_maxwin || after(end, sender->td_end - receiver->td_maxwin - 1); pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n", before(seq, sender->td_maxend + 1), (in_recv_win ? 1 : 0), before(sack, receiver->td_end + 1), after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)); if (before(seq, sender->td_maxend + 1) && in_recv_win && before(sack, receiver->td_end + 1) && after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) { /* * Take into account window scaling (RFC 1323). */ if (!tcph->syn) win <<= sender->td_scale; /* * Update sender data. */ swin = win + (sack - ack); if (sender->td_maxwin < swin) sender->td_maxwin = swin; if (after(end, sender->td_end)) { sender->td_end = end; sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; } if (tcph->ack) { if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) { sender->td_maxack = ack; sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET; } else if (after(ack, sender->td_maxack)) sender->td_maxack = ack; } /* * Update receiver data. */ if (receiver->td_maxwin != 0 && after(end, sender->td_maxend)) receiver->td_maxwin += end - sender->td_maxend; if (after(sack + win, receiver->td_maxend - 1)) { receiver->td_maxend = sack + win; if (win == 0) receiver->td_maxend++; } if (ack == receiver->td_end) receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; /* * Check retransmissions. */ if (index == TCP_ACK_SET) { if (state->last_dir == dir && state->last_seq == seq && state->last_ack == ack && state->last_end == end && state->last_win == win_raw) state->retrans++; else { state->last_dir = dir; state->last_seq = seq; state->last_ack = ack; state->last_end = end; state->last_win = win_raw; state->retrans = 0; } } res = true; } else { res = false; if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || tn->tcp_be_liberal) res = true; if (!res) { nf_ct_l4proto_log_invalid(skb, ct, "%s", before(seq, sender->td_maxend + 1) ? in_recv_win ? before(sack, receiver->td_end + 1) ? after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? 
"BUG" : "ACK is under the lower bound (possible overly delayed ACK)" : "ACK is over the upper bound (ACKed data not seen yet)" : "SEQ is under the lower bound (already ACKed data retransmitted)" : "SEQ is over the upper bound (over the window of the receiver)"); } } pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u " "receiver end=%u maxend=%u maxwin=%u\n", res, sender->td_end, sender->td_maxend, sender->td_maxwin, receiver->td_end, receiver->td_maxend, receiver->td_maxwin); return res; } /* table of valid flag combinations - PUSH, ECE and CWR are always valid */ static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK| TCPHDR_URG) + 1] = { [TCPHDR_SYN] = 1, [TCPHDR_SYN|TCPHDR_URG] = 1, [TCPHDR_SYN|TCPHDR_ACK] = 1, [TCPHDR_RST] = 1, [TCPHDR_RST|TCPHDR_ACK] = 1, [TCPHDR_FIN|TCPHDR_ACK] = 1, [TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG] = 1, [TCPHDR_ACK] = 1, [TCPHDR_ACK|TCPHDR_URG] = 1, }; static void tcp_error_log(const struct sk_buff *skb, struct net *net, u8 pf, const char *msg) { nf_l4proto_log_invalid(skb, net, pf, IPPROTO_TCP, "%s", msg); } /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ static int tcp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u_int8_t pf, unsigned int hooknum) { const struct tcphdr *th; struct tcphdr _tcph; unsigned int tcplen = skb->len - dataoff; u_int8_t tcpflags; /* Smaller that minimal TCP header? */ th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); if (th == NULL) { tcp_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { tcp_error_log(skb, net, pf, "truncated packet"); return -NF_ACCEPT; } /* Checksum invalid? Ignore. * We skip checking packets on the outgoing path * because the checksum is assumed to be correct. */ /* FIXME: Source route IP option packets --RR */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { tcp_error_log(skb, net, pf, "bad checksum"); return -NF_ACCEPT; } /* Check TCP flags. */ tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH)); if (!tcp_valid_flags[tcpflags]) { tcp_error_log(skb, net, pf, "invalid tcp flag combination"); return -NF_ACCEPT; } return NF_ACCEPT; } static bool nf_conntrack_tcp_established(const struct nf_conn *ct) { return ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED && test_bit(IPS_ASSURED_BIT, &ct->status); } static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state) { state->td_end = 0; state->td_maxend = 0; state->td_maxwin = 0; state->td_maxack = 0; state->td_scale = 0; state->flags &= IP_CT_TCP_FLAG_BE_LIBERAL; } /* Returns verdict for packet, or -1 for invalid. 
*/ static int tcp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo) { struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = tcp_pernet(net); struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; unsigned int index, *timeouts; enum ip_conntrack_dir dir; const struct tcphdr *th; struct tcphdr _tcph; unsigned long timeout; th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); BUG_ON(th == NULL); spin_lock_bh(&ct->lock); old_state = ct->proto.tcp.state; dir = CTINFO2DIR(ctinfo); index = get_conntrack_index(th); new_state = tcp_conntracks[dir][index][old_state]; tuple = &ct->tuplehash[dir].tuple; switch (new_state) { case TCP_CONNTRACK_SYN_SENT: if (old_state < TCP_CONNTRACK_TIME_WAIT) break; /* RFC 1122: "When a connection is closed actively, * it MUST linger in TIME-WAIT state for a time 2xMSL * (Maximum Segment Lifetime). However, it MAY accept * a new SYN from the remote TCP to reopen the connection * directly from TIME-WAIT state, if..." * We ignore the conditions because we are in the * TIME-WAIT state anyway. * * Handle aborted connections: we and the server * think there is an existing connection but the client * aborts it and starts a new one. */ if (((ct->proto.tcp.seen[dir].flags | ct->proto.tcp.seen[!dir].flags) & IP_CT_TCP_FLAG_CLOSE_INIT) || (ct->proto.tcp.last_dir == dir && ct->proto.tcp.last_index == TCP_RST_SET)) { /* Attempt to reopen a closed/aborted connection. * Delete this connection and look up again. */ spin_unlock_bh(&ct->lock); /* Only repeat if we can actually remove the timer. * Destruction may already be in progress in process * context and we must give it a chance to terminate. */ if (nf_ct_kill(ct)) return -NF_REPEAT; return NF_DROP; } /* Fall through */ case TCP_CONNTRACK_IGNORE: /* Ignored packets: * * Our connection entry may be out of sync, so ignore * packets which may signal the real connection between * the client and the server. * * a) SYN in ORIGINAL * b) SYN/ACK in REPLY * c) ACK in reply direction after initial SYN in original. * * If the ignored packet is invalid, the receiver will send * a RST we'll catch below. */ if (index == TCP_SYNACK_SET && ct->proto.tcp.last_index == TCP_SYN_SET && ct->proto.tcp.last_dir != dir && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { /* b) This SYN/ACK acknowledges a SYN that we earlier * ignored as invalid. This means that the client and * the server are both in sync, while the firewall is * not. We get in sync from the previously annotated * values. */ old_state = TCP_CONNTRACK_SYN_SENT; new_state = TCP_CONNTRACK_SYN_RECV; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end = ct->proto.tcp.last_end; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend = ct->proto.tcp.last_end; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin = ct->proto.tcp.last_win == 0 ? 1 : ct->proto.tcp.last_win; ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale = ct->proto.tcp.last_wscale; ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags = ct->proto.tcp.last_flags; nf_ct_tcp_state_reset(&ct->proto.tcp.seen[dir]); break; } ct->proto.tcp.last_index = index; ct->proto.tcp.last_dir = dir; ct->proto.tcp.last_seq = ntohl(th->seq); ct->proto.tcp.last_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); ct->proto.tcp.last_win = ntohs(th->window); /* a) This is a SYN in ORIGINAL. The client and the server * may be in sync but we are not. 
In that case, we annotate * the TCP options and let the packet go through. If it is a * valid SYN packet, the server will reply with a SYN/ACK, and * then we'll get in sync. Otherwise, the server potentially * responds with a challenge ACK if implementing RFC5961. */ if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) { struct ip_ct_tcp_state seen = {}; ct->proto.tcp.last_flags = ct->proto.tcp.last_wscale = 0; tcp_options(skb, dataoff, th, &seen); if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { ct->proto.tcp.last_flags |= IP_CT_TCP_FLAG_WINDOW_SCALE; ct->proto.tcp.last_wscale = seen.td_scale; } if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) { ct->proto.tcp.last_flags |= IP_CT_TCP_FLAG_SACK_PERM; } /* Mark the potential for RFC5961 challenge ACK, * this poses a special problem for LAST_ACK state * as an ACK is interpreted as ACKing the last FIN. */ if (old_state == TCP_CONNTRACK_LAST_ACK) ct->proto.tcp.last_flags |= IP_CT_EXP_CHALLENGE_ACK; } spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, "invalid packet ignored in " "state %s ", tcp_conntrack_names[old_state]); return NF_ACCEPT; case TCP_CONNTRACK_MAX: /* Special case for SYN proxy: when the SYN to the server or * the SYN/ACK from the server is lost, the client may transmit * a keep-alive packet while in SYN_SENT state. This needs to * be associated with the original conntrack entry in order to * generate a new SYN with the correct sequence number. */ if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT && index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL && ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL && ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) { pr_debug("nf_ct_tcp: SYN proxy client keep alive\n"); spin_unlock_bh(&ct->lock); return NF_ACCEPT; } /* Invalid packet */ pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, "invalid state"); return -NF_ACCEPT; case TCP_CONNTRACK_TIME_WAIT: /* RFC5961 compliance causes the stack to send "challenge-ACK" * e.g. in response to spurious SYNs. Conntrack MUST * not believe this ACK is acking last FIN. */ if (old_state == TCP_CONNTRACK_LAST_ACK && index == TCP_ACK_SET && ct->proto.tcp.last_dir != dir && ct->proto.tcp.last_index == TCP_SYN_SET && (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) { /* Detected RFC5961 challenge ACK */ ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK; spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, "challenge-ack ignored"); return NF_ACCEPT; /* Don't change state */ } break; case TCP_CONNTRACK_SYN_SENT2: /* tcp_conntracks table is not smart enough to handle * simultaneous open.
*/ ct->proto.tcp.last_flags |= IP_CT_TCP_SIMULTANEOUS_OPEN; break; case TCP_CONNTRACK_SYN_RECV: if (dir == IP_CT_DIR_REPLY && index == TCP_ACK_SET && ct->proto.tcp.last_flags & IP_CT_TCP_SIMULTANEOUS_OPEN) new_state = TCP_CONNTRACK_ESTABLISHED; break; case TCP_CONNTRACK_CLOSE: if (index != TCP_RST_SET) break; if (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) { u32 seq = ntohl(th->seq); if (before(seq, ct->proto.tcp.seen[!dir].td_maxack)) { /* Invalid RST */ spin_unlock_bh(&ct->lock); nf_ct_l4proto_log_invalid(skb, ct, "invalid rst"); return -NF_ACCEPT; } if (!nf_conntrack_tcp_established(ct) || seq == ct->proto.tcp.seen[!dir].td_maxack) break; /* Check if rst is part of train, such as * foo:80 > bar:4379: P, 235946583:235946602(19) ack 42 * foo:80 > bar:4379: R, 235946602:235946602(0) ack 42 */ if (ct->proto.tcp.last_index == TCP_ACK_SET && ct->proto.tcp.last_dir == dir && seq == ct->proto.tcp.last_end) break; /* ... RST sequence number doesn't match exactly, keep * established state to allow a possible challenge ACK. */ new_state = old_state; } if (((test_bit(IPS_SEEN_REPLY_BIT, &ct->status) && ct->proto.tcp.last_index == TCP_SYN_SET) || (!test_bit(IPS_ASSURED_BIT, &ct->status) && ct->proto.tcp.last_index == TCP_ACK_SET)) && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { /* RST sent to invalid SYN or ACK we had let through * at a) and c) above: * * a) SYN was in window then * c) we hold a half-open connection. * * Delete our connection entry. * We skip window checking, because packet might ACK * segments we ignored. */ goto in_window; } break; default: /* Keep compilers happy. */ break; } if (!tcp_in_window(ct, &ct->proto.tcp, dir, index, skb, dataoff, th)) { spin_unlock_bh(&ct->lock); return -NF_ACCEPT; } in_window: /* From now on we have got in-window packets */ ct->proto.tcp.last_index = index; ct->proto.tcp.last_dir = dir; pr_debug("tcp_conntracks: "); nf_ct_dump_tuple(tuple); pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", (th->syn ? 1 : 0), (th->ack ? 1 : 0), (th->fin ? 1 : 0), (th->rst ? 1 : 0), old_state, new_state); ct->proto.tcp.state = new_state; if (old_state != new_state && new_state == TCP_CONNTRACK_FIN_WAIT) ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = tn->timeouts; if (ct->proto.tcp.retrans >= tn->tcp_max_retrans && timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) timeout = timeouts[TCP_CONNTRACK_RETRANS]; else if (unlikely(index == TCP_RST_SET)) timeout = timeouts[TCP_CONNTRACK_CLOSE]; else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) & IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED && timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK]) timeout = timeouts[TCP_CONNTRACK_UNACK]; else if (ct->proto.tcp.last_win == 0 && timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS]) timeout = timeouts[TCP_CONNTRACK_RETRANS]; else timeout = timeouts[new_state]; spin_unlock_bh(&ct->lock); if (new_state != old_state) nf_conntrack_event_cache(IPCT_PROTOINFO, ct); if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { /* If only reply is a RST, we can consider ourselves not to have an established connection: this is a fairly common problem case, so we can delete the conntrack immediately. --RR */ if (th->rst) { nf_ct_kill_acct(ct, ctinfo, skb); return NF_ACCEPT; } /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection * pickup with loose=1. Avoid large ESTABLISHED timeout. 
*/ if (new_state == TCP_CONNTRACK_ESTABLISHED && timeout > timeouts[TCP_CONNTRACK_UNACK]) timeout = timeouts[TCP_CONNTRACK_UNACK]; } else if (!test_bit(IPS_ASSURED_BIT, &ct->status) && (old_state == TCP_CONNTRACK_SYN_RECV || old_state == TCP_CONNTRACK_ESTABLISHED) && new_state == TCP_CONNTRACK_ESTABLISHED) { /* Set ASSURED if we see a valid ack in ESTABLISHED after SYN_RECV or a valid answer for a picked up connection. */ set_bit(IPS_ASSURED_BIT, &ct->status); nf_conntrack_event_cache(IPCT_ASSURED, ct); } nf_ct_refresh_acct(ct, ctinfo, skb, timeout); return NF_ACCEPT; } /* Called when a new connection for this protocol is found. */ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff) { enum tcp_conntrack new_state; const struct tcphdr *th; struct tcphdr _tcph; struct net *net = nf_ct_net(ct); struct nf_tcp_net *tn = tcp_pernet(net); const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0]; const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1]; th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); BUG_ON(th == NULL); /* Don't need lock here: this conntrack not in circulation yet */ new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE]; /* Invalid: delete conntrack */ if (new_state >= TCP_CONNTRACK_MAX) { pr_debug("nf_ct_tcp: invalid new deleting.\n"); return false; } if (new_state == TCP_CONNTRACK_SYN_SENT) { memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); /* SYN packet */ ct->proto.tcp.seen[0].td_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); if (ct->proto.tcp.seen[0].td_maxwin == 0) ct->proto.tcp.seen[0].td_maxwin = 1; ct->proto.tcp.seen[0].td_maxend = ct->proto.tcp.seen[0].td_end; tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); } else if (tn->tcp_loose == 0) { /* Don't try to pick up connections. */ return false; } else { memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); /* * We are in the middle of a connection, * its history is lost for us. * Let's try to use the data from the packet.
*/ ct->proto.tcp.seen[0].td_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); if (ct->proto.tcp.seen[0].td_maxwin == 0) ct->proto.tcp.seen[0].td_maxwin = 1; ct->proto.tcp.seen[0].td_maxend = ct->proto.tcp.seen[0].td_end + ct->proto.tcp.seen[0].td_maxwin; /* We assume SACK and liberal window checking to handle * window scaling */ ct->proto.tcp.seen[0].flags = ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM | IP_CT_TCP_FLAG_BE_LIBERAL; } /* tcp_packet will set them */ ct->proto.tcp.last_index = TCP_NONE_SET; pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, sender->td_scale, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); return true; } static bool tcp_can_early_drop(const struct nf_conn *ct) { switch (ct->proto.tcp.state) { case TCP_CONNTRACK_FIN_WAIT: case TCP_CONNTRACK_LAST_ACK: case TCP_CONNTRACK_TIME_WAIT: case TCP_CONNTRACK_CLOSE: case TCP_CONNTRACK_CLOSE_WAIT: return true; default: break; } return false; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct) { struct nlattr *nest_parms; struct nf_ct_tcp_flags tmp = {}; spin_lock_bh(&ct->lock); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state) || nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, ct->proto.tcp.seen[0].td_scale) || nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, ct->proto.tcp.seen[1].td_scale)) goto nla_put_failure; tmp.flags = ct->proto.tcp.seen[0].flags; if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, sizeof(struct nf_ct_tcp_flags), &tmp)) goto nla_put_failure; tmp.flags = ct->proto.tcp.seen[1].flags; if (nla_put(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, sizeof(struct nf_ct_tcp_flags), &tmp)) goto nla_put_failure; spin_unlock_bh(&ct->lock); nla_nest_end(skb, nest_parms); return 0; nla_put_failure: spin_unlock_bh(&ct->lock); return -1; } static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = { [CTA_PROTOINFO_TCP_STATE] = { .type = NLA_U8 }, [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 }, [CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NLA_U8 }, [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .len = sizeof(struct nf_ct_tcp_flags) }, [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) }, }; #define TCP_NLATTR_SIZE ( \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + 1) + \ NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags)) + \ NLA_ALIGN(NLA_HDRLEN + sizeof(struct nf_ct_tcp_flags))) static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) { struct nlattr *pattr = cda[CTA_PROTOINFO_TCP]; struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1]; int err; /* updates could not contain anything about the private * protocol info, in that case skip the parsing */ if (!pattr) return 0; err = nla_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, pattr, tcp_nla_policy, NULL); if (err < 0) return err; if (tb[CTA_PROTOINFO_TCP_STATE] && nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX) return -EINVAL; spin_lock_bh(&ct->lock); if (tb[CTA_PROTOINFO_TCP_STATE]) ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]); if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) { struct nf_ct_tcp_flags *attr = 
nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]); ct->proto.tcp.seen[0].flags &= ~attr->mask; ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask; } if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) { struct nf_ct_tcp_flags *attr = nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]); ct->proto.tcp.seen[1].flags &= ~attr->mask; ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask; } if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] && tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] && ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE && ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { ct->proto.tcp.seen[0].td_scale = nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]); ct->proto.tcp.seen[1].td_scale = nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]); } spin_unlock_bh(&ct->lock); return 0; } static unsigned int tcp_nlattr_tuple_size(void) { static unsigned int size __read_mostly; if (!size) size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); return size; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { struct nf_tcp_net *tn = tcp_pernet(net); unsigned int *timeouts = data; int i; if (!timeouts) timeouts = tn->timeouts; /* set default TCP timeouts. */ for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++) timeouts[i] = tn->timeouts[i]; if (tb[CTA_TIMEOUT_TCP_SYN_SENT]) { timeouts[TCP_CONNTRACK_SYN_SENT] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ; } if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) { timeouts[TCP_CONNTRACK_SYN_RECV] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ; } if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) { timeouts[TCP_CONNTRACK_ESTABLISHED] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ; } if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) { timeouts[TCP_CONNTRACK_FIN_WAIT] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ; } if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) { timeouts[TCP_CONNTRACK_CLOSE_WAIT] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ; } if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) { timeouts[TCP_CONNTRACK_LAST_ACK] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ; } if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) { timeouts[TCP_CONNTRACK_TIME_WAIT] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ; } if (tb[CTA_TIMEOUT_TCP_CLOSE]) { timeouts[TCP_CONNTRACK_CLOSE] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ; } if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) { timeouts[TCP_CONNTRACK_SYN_SENT2] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ; } if (tb[CTA_TIMEOUT_TCP_RETRANS]) { timeouts[TCP_CONNTRACK_RETRANS] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ; } if (tb[CTA_TIMEOUT_TCP_UNACK]) { timeouts[TCP_CONNTRACK_UNACK] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ; } timeouts[CTA_TIMEOUT_TCP_UNSPEC] = timeouts[CTA_TIMEOUT_TCP_SYN_SENT]; return 0; } static int tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT, htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV, htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED, htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT, htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT, htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK, 
htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT, htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE, htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2, htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS, htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK, htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = { [CTA_TIMEOUT_TCP_SYN_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_SYN_RECV] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_ESTABLISHED] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_FIN_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_CLOSE_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_LAST_ACK] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_TIME_WAIT] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_CLOSE] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_SYN_SENT2] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_RETRANS] = { .type = NLA_U32 }, [CTA_TIMEOUT_TCP_UNACK] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_SYSCTL static struct ctl_table tcp_sysctl_table[] = { { .procname = "nf_conntrack_tcp_timeout_syn_sent", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_syn_recv", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_established", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_fin_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_close_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_last_ack", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_time_wait", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_close", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_max_retrans", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_unacknowledged", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_loose", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "nf_conntrack_tcp_be_liberal", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "nf_conntrack_tcp_max_retrans", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, }, { } }; #endif /* CONFIG_SYSCTL */ static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn, struct nf_tcp_net *tn) { #ifdef CONFIG_SYSCTL if (pn->ctl_table) return 0; pn->ctl_table = kmemdup(tcp_sysctl_table, sizeof(tcp_sysctl_table), GFP_KERNEL); if (!pn->ctl_table) return -ENOMEM; pn->ctl_table[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT]; pn->ctl_table[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV]; pn->ctl_table[2].data = 
&tn->timeouts[TCP_CONNTRACK_ESTABLISHED]; pn->ctl_table[3].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT]; pn->ctl_table[4].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT]; pn->ctl_table[5].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK]; pn->ctl_table[6].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT]; pn->ctl_table[7].data = &tn->timeouts[TCP_CONNTRACK_CLOSE]; pn->ctl_table[8].data = &tn->timeouts[TCP_CONNTRACK_RETRANS]; pn->ctl_table[9].data = &tn->timeouts[TCP_CONNTRACK_UNACK]; pn->ctl_table[10].data = &tn->tcp_loose; pn->ctl_table[11].data = &tn->tcp_be_liberal; pn->ctl_table[12].data = &tn->tcp_max_retrans; #endif return 0; } static int tcp_init_net(struct net *net, u_int16_t proto) { struct nf_tcp_net *tn = tcp_pernet(net); struct nf_proto_net *pn = &tn->pn; if (!pn->users) { int i; for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++) tn->timeouts[i] = tcp_timeouts[i]; /* timeouts[0] is unused, make it same as SYN_SENT so * ->timeouts[0] contains 'new' timeout, like udp or icmp. */ tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT]; tn->tcp_loose = nf_ct_tcp_loose; tn->tcp_be_liberal = nf_ct_tcp_be_liberal; tn->tcp_max_retrans = nf_ct_tcp_max_retrans; } return tcp_kmemdup_sysctl_table(pn, tn); } static struct nf_proto_net *tcp_get_net_proto(struct net *net) { return &net->ct.nf_ct_proto.tcp.pn; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 = { .l3proto = PF_INET, .l4proto = IPPROTO_TCP, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, #endif .packet = tcp_packet, .new = tcp_new, .error = tcp_error, .can_early_drop = tcp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = tcp_to_nlattr, .from_nlattr = nlattr_to_tcp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nlattr_tuple_size = tcp_nlattr_tuple_size, .nlattr_size = TCP_NLATTR_SIZE, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = tcp_timeout_nlattr_to_obj, .obj_to_nlattr = tcp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_TCP_MAX, .obj_size = sizeof(unsigned int) * TCP_CONNTRACK_TIMEOUT_MAX, .nla_policy = tcp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ .init_net = tcp_init_net, .get_net_proto = tcp_get_net_proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4); const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 = { .l3proto = PF_INET6, .l4proto = IPPROTO_TCP, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = tcp_print_conntrack, #endif .packet = tcp_packet, .new = tcp_new, .error = tcp_error, .can_early_drop = tcp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .nlattr_size = TCP_NLATTR_SIZE, .to_nlattr = tcp_to_nlattr, .from_nlattr = nlattr_to_tcp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nlattr_tuple_size = tcp_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = tcp_timeout_nlattr_to_obj, .obj_to_nlattr = tcp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_TCP_MAX, .obj_size = sizeof(unsigned int) * TCP_CONNTRACK_TIMEOUT_MAX, .nla_policy = tcp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ .init_net = tcp_init_net, .get_net_proto = tcp_get_net_proto, }; EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6);
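/*
 * Illustrative note (not part of the original file): the three-dimensional
 * tcp_conntracks[] table defined above is consumed in tcp_packet() as
 *
 *	new_state = tcp_conntracks[dir][get_conntrack_index(th)][old_state];
 *
 * so, for example, a client SYN on a fresh entry maps
 * [IP_CT_DIR_ORIGINAL][TCP_SYN_SET][TCP_CONNTRACK_NONE] to
 * TCP_CONNTRACK_SYN_SENT (sNO -> sSS in the table's shorthand), while the
 * sysctl knobs registered in tcp_sysctl_table[] tune the per-state timeouts
 * and the loose/liberal behaviour at run time.
 */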
/* * IPv6 library code, needed by static components when full IPv6 support is * not configured or static. These functions are needed by GSO/GRO implementation. */ #include <linux/export.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/addrconf.h> #include <net/secure_seq.h> #include <linux/netfilter.h> static u32 __ipv6_select_ident(struct net *net, const struct in6_addr *dst, const struct in6_addr *src) { u32 id; do { id = prandom_u32(); } while (!id); return id; } /* This function exists only for tap drivers that must support broken * clients requesting UFO without specifying an IPv6 fragment ID. * * This is similar to ipv6_select_ident() but we use an independent hash * seed to limit information leakage. * * The network header must be set before calling this. */ __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb) { struct in6_addr buf[2]; struct in6_addr *addrs; u32 id; addrs = skb_header_pointer(skb, skb_network_offset(skb) + offsetof(struct ipv6hdr, saddr), sizeof(buf), buf); if (!addrs) return 0; id = __ipv6_select_ident(net, &addrs[1], &addrs[0]); return htonl(id); } EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr) { u32 id; id = __ipv6_select_ident(net, daddr, saddr); return htonl(id); } EXPORT_SYMBOL(ipv6_select_ident); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) { unsigned int offset = sizeof(struct ipv6hdr); unsigned int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); int found_rhdr = 0; *nexthdr = &ipv6_hdr(skb)->nexthdr; while (offset <= packet_len) { struct ipv6_opt_hdr *exthdr; switch (**nexthdr) { case NEXTHDR_HOP: break; case NEXTHDR_ROUTING: found_rhdr = 1; break; case NEXTHDR_DEST: #if IS_ENABLED(CONFIG_IPV6_MIP6) if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) break; #endif if (found_rhdr) return offset; break; default: return offset; } if (offset + sizeof(struct ipv6_opt_hdr) > packet_len) return -EINVAL; exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + offset); offset += ipv6_optlen(exthdr); if (offset > IPV6_MAXPLEN) return -EINVAL; *nexthdr = &exthdr->nexthdr; } return -EINVAL; } EXPORT_SYMBOL(ip6_find_1stfragopt); #if IS_ENABLED(CONFIG_IPV6) int ip6_dst_hoplimit(struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); if (hoplimit == 0) { struct net_device *dev = dst->dev; struct inet6_dev *idev; rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) hoplimit = idev->cnf.hop_limit; else hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit; rcu_read_unlock(); } return hoplimit; } EXPORT_SYMBOL(ip6_dst_hoplimit); #endif int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int len; len = skb->len - sizeof(struct ipv6hdr); if (len > IPV6_MAXPLEN) len = 0; ipv6_hdr(skb)->payload_len = htons(len); IP6CB(skb)->nhoff = offsetof(struct
ipv6hdr, nexthdr); /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip6_out(sk, skb); if (unlikely(!skb)) return 0; skb->protocol = htons(ETH_P_IPV6); return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); } EXPORT_SYMBOL_GPL(__ip6_local_out); int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { int err; err = __ip6_local_out(net, sk, skb); if (likely(err == 1)) err = dst_output(net, sk, skb); return err; } EXPORT_SYMBOL_GPL(ip6_local_out);
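/*
 * Illustrative sketch (not part of the file above): __ip6_local_out()
 * derives the IPv6 payload_len as "skb->len - sizeof(struct ipv6hdr)" and
 * stores 0 when the result would not fit the 16-bit field (IPV6_MAXPLEN,
 * i.e. 65535) -- the jumbogram convention, where the real length travels in
 * a hop-by-hop option instead.  demo_payload_len() below is a hypothetical,
 * stand-alone rendering of just that arithmetic; the DEMO_* constants are
 * stand-ins for the kernel definitions.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_IPV6_HDRLEN	40	/* sizeof(struct ipv6hdr) */
#define DEMO_IPV6_MAXPLEN	65535

/* Return the big-endian payload_len field for a packet of @total bytes
 * (@total must include the 40-byte fixed IPv6 header). */
static uint16_t demo_payload_len(size_t total)
{
	size_t len = total - DEMO_IPV6_HDRLEN;

	if (len > DEMO_IPV6_MAXPLEN)
		len = 0;		/* jumbogram: length carried elsewhere */
	return htons((uint16_t)len);
}

int main(void)
{
	printf("1500-byte packet  -> payload_len %u\n",
	       (unsigned)ntohs(demo_payload_len(1500)));
	printf("80000-byte packet -> payload_len %u (jumbogram)\n",
	       (unsigned)ntohs(demo_payload_len(80000)));
	return 0;
}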
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c)
2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> */ /* * mballoc.c contains the multiblocks allocation routines */ #include "ext4_jbd2.h" #include "mballoc.h" #include <linux/log2.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/nospec.h> #include <linux/backing-dev.h> #include <trace/events/ext4.h> #ifdef CONFIG_EXT4_DEBUG ushort ext4_mballoc_debug __read_mostly; module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644); MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc"); #endif /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() * - search for metadata in few groups * * TODO v4: * - normalization should take into account whether file is still open * - discard preallocations if no free space left (policy?) * - don't normalize tails * - quota * - reservation for superuser * * TODO v3: * - bitmap read-ahead (proposed by Oleg Drokin aka green) * - track min/max extents in each group for better group selection * - mb_mark_used() may allocate chunk right after splitting buddy * - tree of groups sorted by number of free blocks * - error handling */ /* * The allocation request involve request for multiple number of blocks * near to the goal(block) value specified. * * During initialization phase of the allocator we decide to use the * group preallocation or inode preallocation depending on the size of * the file. The size of the file could be the resulting file size we * would have after allocation, or the current file size, which ever * is larger. If the size is less than sbi->s_mb_stream_request we * select to use the group preallocation. The default value of * s_mb_stream_request is 16 blocks. This can also be tuned via * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in * terms of number of blocks. * * The main motivation for having small file use group preallocation is to * ensure that we have small files closer together on the disk. * * First stage the allocator looks at the inode prealloc list, * ext4_inode_info->i_prealloc_list, which contains list of prealloc * spaces for this particular inode. The inode prealloc space is * represented as: * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space * pa_len -> length for this prealloc space (in clusters) * pa_free -> free space available in this prealloc space (in clusters) * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc * space we will consume the particular prealloc space. This makes sure that * we have contiguous physical blocks representing the file blocks * * The important thing to be noted in case of inode prealloc space is that * we don't modify the values associated to inode prealloc space except * pa_free. * * If we are not able to find blocks in the inode prealloc space and if we * have the group allocation flag set then we look at the locality group * prealloc space. These are per CPU prealloc list represented as * * ext4_sb_info.s_locality_groups[smp_processor_id()] * * The reason for having a per cpu locality group is to reduce the contention * between CPUs. It is possible to get scheduled at this point. * * The locality group prealloc space is used looking at whether we have * enough free space (pa_free) within the prealloc space. 
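 *
 * (Editorial aside, not part of the original comment: the size-threshold
 * rule described above is illustrated by the small stand-alone sketch that
 * follows; the original text resumes immediately after it.)
 */

/*
 * Minimal sketch, assuming only what the comment above states: requests for
 * files smaller than s_mb_stream_request (16 blocks by default, tunable via
 * /sys/fs/ext4/<partition>/mb_stream_req) use the per-CPU locality group
 * preallocation, while larger files get per-inode preallocation.
 * demo_use_group_prealloc() is a hypothetical name, not an ext4 function.
 */
#include <stdbool.h>
#include <stdio.h>

static bool demo_use_group_prealloc(unsigned int size_blocks,
				    unsigned int mb_stream_req)
{
	/* "small" files are kept close together via the locality group PA */
	return size_blocks < mb_stream_req;
}

int main(void)
{
	printf("4 blocks  -> %s\n",
	       demo_use_group_prealloc(4, 16) ? "group PA" : "inode PA");
	printf("64 blocks -> %s\n",
	       demo_use_group_prealloc(64, 16) ? "group PA" : "inode PA");
	return 0;
}

/*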
* * If we can't allocate blocks via inode prealloc or/and locality group * prealloc then we look at the buddy cache. The buddy cache is represented * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets * mapped to the buddy and bitmap information regarding different * groups. The buddy information is attached to buddy cache inode so that * we can access them through the page cache. The information regarding * each group is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are stored in the * inode as: * * { page } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. So for each group we * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE / * blocksize) blocks. So it can have information regarding groups_per_page * which is blocks_per_page/2 * * The buddy cache inode is not stored on disk. The inode is thrown * away when the filesystem is unmounted. * * We look for count number of blocks in the buddy cache. If we were able * to locate that many free blocks we return with additional information * regarding rest of the contiguous physical block available * * Before allocating blocks via buddy cache we normalize the request * blocks. This ensure we ask for more blocks that we needed. The extra * blocks that we get after allocation is added to the respective prealloc * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is * dependent on the cluster size; for non-bigalloc file systems, it is * 512 blocks. This can be tuned via * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O * stripe=<value> option the group prealloc request is normalized to the * the smallest multiple of the stripe value (sbi->s_stripe) which is * greater than the default mb_group_prealloc. * * The regular allocator (using the buddy cache) supports a few tunables. * * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan * /sys/fs/ext4/<partition>/mb_order2_req * * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The * value of s_mb_order2_reqs can be tuned via * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to * stripe size (sbi->s_stripe), we try to search for contiguous block in * stripe size. This should result in better allocation on RAID setups. If * not, we search in the specific group using bitmap for best extents. The * tunable min_to_scan and max_to_scan control the behaviour here. * min_to_scan indicate how long the mballoc __must__ look for a best * extent and max_to_scan indicates how long the mballoc __can__ look for a * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * * Both the prealloc space are getting populated as above. So for the first * request we will hit the buddy cache which will result in this prealloc * space getting filled. 
The prealloc space is then later used for the * subsequent request. */ /* * mballoc operates on the following data: * - on-disk bitmap * - in-core buddy (actually includes buddy and bitmap) * - preallocation descriptors (PAs) * * there are two types of preallocations: * - inode * assiged to specific inode and can be used for this inode only. * it describes part of inode's space preallocated to specific * physical blocks. any block from that preallocated can be used * independent. the descriptor just tracks number of blocks left * unused. so, before taking some block from descriptor, one must * make sure corresponded logical block isn't allocated yet. this * also means that freeing any block within descriptor's range * must discard all preallocated blocks. * - locality group * assigned to specific locality group which does not translate to * permanent set of inodes: inode can join and leave group. space * from this type of preallocation can be used for any inode. thus * it's consumed from the beginning to the end. * * relation between them can be expressed as: * in-core buddy = on-disk bitmap + preallocation descriptors * * this mean blocks mballoc considers used are: * - allocated blocks (persistent) * - preallocated blocks (non-persistent) * * consistency in mballoc world means that at any time a block is either * free or used in ALL structures. notice: "any time" should not be read * literally -- time is discrete and delimited by locks. * * to keep it simple, we don't use block numbers, instead we count number of * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA. * * all operations can be expressed as: * - init buddy: buddy = on-disk + PAs * - new PA: buddy += N; PA = N * - use inode PA: on-disk += N; PA -= N * - discard inode PA buddy -= on-disk - PA; PA = 0 * - use locality group PA on-disk += N; PA -= N * - discard locality group PA buddy -= PA; PA = 0 * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap * is used in real operation because we can't know actual used * bits from PA, only from on-disk bitmap * * if we follow this strict logic, then all operations above should be atomic. * given some of them can block, we'd have to use something like semaphores * killing performance on high-end SMP hardware. let's try to relax it using * the following knowledge: * 1) if buddy is referenced, it's already initialized * 2) while block is used in buddy and the buddy is referenced, * nobody can re-allocate that block * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has * bit set and PA claims same block, it's OK. IOW, one can set bit in * on-disk bitmap if buddy has same bit set or/and PA covers corresponded * block * * so, now we're building a concurrency table: * - init buddy vs. * - new PA * blocks for PA are allocated in the buddy, buddy must be referenced * until PA is linked to allocation group to avoid concurrent buddy init * - use inode PA * we need to make sure that either on-disk bitmap or PA has uptodate data * given (3) we care that PA-=N operation doesn't interfere with init * - discard inode PA * the simplest way would be to have buddy initialized by the discard * - use locality group PA * again PA-=N must be serialized with init * - discard locality group PA * the simplest way would be to have buddy initialized by the discard * - new PA vs. 
* - use inode PA * i_data_sem serializes them * - discard inode PA * discard process must wait until PA isn't used by another process * - use locality group PA * some mutex should serialize them * - discard locality group PA * discard process must wait until PA isn't used by another process * - use inode PA * - use inode PA * i_data_sem or another mutex should serializes them * - discard inode PA * discard process must wait until PA isn't used by another process * - use locality group PA * nothing wrong here -- they're different PAs covering different blocks * - discard locality group PA * discard process must wait until PA isn't used by another process * * now we're ready to make few consequences: * - PA is referenced and while it is no discard is possible * - PA is referenced until block isn't marked in on-disk bitmap * - PA changes only after on-disk bitmap * - discard must not compete with init. either init is done before * any discard or they're serialized somehow * - buddy init as sum of on-disk bitmap and PAs is done atomically * * a special case when we've used PA to emptiness. no need to modify buddy * in this case, but we should care about concurrent init * */ /* * Logic in few words: * * - allocation: * load group * find blocks * mark bits in on-disk bitmap * release group * * - use preallocation: * find proper PA (per-inode or group) * load group * mark bits in on-disk bitmap * release group * release PA * * - free: * load group * mark bits in on-disk bitmap * release group * * - discard preallocations in group: * mark PAs deleted * move them onto local list * load on-disk bitmap * load group * remove PA from object (inode or locality group) * mark free blocks in-core * * - discard inode's preallocations: */ /* * Locking rules * * Locks: * - bitlock on a group (group) * - object (inode/locality) (object) * - per-pa lock (pa) * * Paths: * - new pa * object * group * * - find and use pa: * pa * * - release consumed pa: * pa * group * object * * - generate in-core bitmap: * group * pa * * - discard all for given object (inode, locality group): * object * pa * group * * - discard all for given group: * group * pa * group * object * */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; static struct kmem_cache *ext4_free_data_cachep; /* We create slab caches for groupinfo data structures based on the * superblock block size. There will be one per mounted filesystem for * each unique s_blocksize_bits */ #define NR_GRPINFO_CACHES 8 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", "ext4_groupinfo_64k", "ext4_groupinfo_128k" }; static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { #if BITS_PER_LONG == 64 *bit += ((unsigned long) addr & 7UL) << 3; addr = (void *) ((unsigned long) addr & ~7UL); #elif BITS_PER_LONG == 32 *bit += ((unsigned long) addr & 3UL) << 3; addr = (void *) ((unsigned long) addr & ~3UL); #else #error "how many bits you are?!" 
#endif return addr; } static inline int mb_test_bit(int bit, void *addr) { /* * ext4_test_bit on architecture like powerpc * needs unsigned long aligned address */ addr = mb_correct_addr_and_bit(&bit, addr); return ext4_test_bit(bit, addr); } static inline void mb_set_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_set_bit(bit, addr); } static inline void mb_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit(bit, addr); } static inline int mb_test_and_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); return ext4_test_and_clear_bit(bit, addr); } static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); tmpmax = max + fix; start += fix; ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; if (ret > max) return max; return ret; } static inline int mb_find_next_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); tmpmax = max + fix; start += fix; ret = ext4_find_next_bit(addr, tmpmax, start) - fix; if (ret > max) return max; return ret; } static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(max == NULL); if (order > e4b->bd_blkbits + 1) { *max = 0; return NULL; } /* at order 0 we see each particular block */ if (order == 0) { *max = 1 << (e4b->bd_blkbits + 3); return e4b->bd_bitmap; } bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; return bb; } #ifdef DOUBLE_CHECK static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int i; struct super_block *sb = e4b->bd_sb; if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); for (i = 0; i < count; i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(EXT4_SB(sb), first + i); ext4_grp_locked_error(sb, e4b->bd_group, inode ? 
inode->i_ino : 0, blocknr, "freeing block already freed " "(bit %u)", first + i); ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } } static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) { int i; if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); for (i = 0; i < count; i++) { BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); mb_set_bit(first + i, e4b->bd_info->bb_bitmap); } } static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { unsigned char *b1, *b2; int i; b1 = (unsigned char *) e4b->bd_info->bb_bitmap; b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { ext4_msg(e4b->bd_sb, KERN_ERR, "corruption in group %u " "at byte %u(%u): %x in copy != %x " "on disk/prealloc", e4b->bd_group, i, i * 8, b1[i], b2[i]); BUG(); } } } } #else static inline void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { return; } static inline void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) { return; } static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { return; } #endif #ifdef AGGRESSIVE_CHECK #define MB_CHECK_ASSERT(assert) \ do { \ if (!(assert)) { \ printk(KERN_EMERG \ "Assertion failure in %s() at %s:%d: \"%s\"\n", \ function, file, line, # assert); \ BUG(); \ } \ } while (0) static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, const char *function, int line) { struct super_block *sb = e4b->bd_sb; int order = e4b->bd_blkbits + 1; int max; int max2; int i; int j; int k; int count; struct ext4_group_info *grp; int fragments = 0; int fstart; struct list_head *cur; void *buddy; void *buddy2; { static int mb_check_counter; if (mb_check_counter++ % 100 != 0) return 0; } while (order > 1) { buddy = mb_find_buddy(e4b, order, &max); MB_CHECK_ASSERT(buddy); buddy2 = mb_find_buddy(e4b, order - 1, &max2); MB_CHECK_ASSERT(buddy2); MB_CHECK_ASSERT(buddy != buddy2); MB_CHECK_ASSERT(max * 2 == max2); count = 0; for (i = 0; i < max; i++) { if (mb_test_bit(i, buddy)) { /* only single bit in buddy2 may be 1 */ if (!mb_test_bit(i << 1, buddy2)) { MB_CHECK_ASSERT( mb_test_bit((i<<1)+1, buddy2)); } else if (!mb_test_bit((i << 1) + 1, buddy2)) { MB_CHECK_ASSERT( mb_test_bit(i << 1, buddy2)); } continue; } /* both bits in buddy2 must be 1 */ MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); for (j = 0; j < (1 << order); j++) { k = (i * (1 << order)) + j; MB_CHECK_ASSERT( !mb_test_bit(k, e4b->bd_bitmap)); } count++; } MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); order--; } fstart = -1; buddy = mb_find_buddy(e4b, 0, &max); for (i = 0; i < max; i++) { if (!mb_test_bit(i, buddy)) { MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); if (fstart == -1) { fragments++; fstart = i; } continue; } fstart = -1; /* check used bits only */ for (j = 0; j < e4b->bd_blkbits + 1; j++) { buddy2 = mb_find_buddy(e4b, j, &max2); k = i >> j; MB_CHECK_ASSERT(k < max2); MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); } } MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); grp = ext4_get_group_info(sb, e4b->bd_group); list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; pa = 
list_entry(cur, struct ext4_prealloc_space, pa_group_list); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); MB_CHECK_ASSERT(groupnr == e4b->bd_group); for (i = 0; i < pa->pa_len; i++) MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); } return 0; } #undef MB_CHECK_ASSERT #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ __FILE__, __func__, __LINE__) #else #define mb_check_buddy(e4b) #endif /* * Divide blocks started from @first with length @len into * smaller chunks with power of 2 blocks. * Clear the bits in bitmap which the blocks of the chunk(s) covered, * then increase bb_counters[] for corresponded chunk size. */ static void ext4_mb_mark_free_simple(struct super_block *sb, void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t min; ext4_grpblk_t max; ext4_grpblk_t chunk; unsigned int border; BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); border = 2 << sb->s_blocksize_bits; while (len > 0) { /* find how many blocks can be covered since this position */ max = ffs(first | border) - 1; /* find how many blocks of power 2 we need to mark */ min = fls(len) - 1; if (max < min) min = max; chunk = 1 << min; /* mark multiblock chunks only */ grp->bb_counters[min]++; if (min > 0) mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); len -= chunk; first += chunk; } } /* * Cache the order of the largest free extent we have available in this block * group. */ static void mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) { int i; int bits; grp->bb_largest_free_order = -1; /* uninit */ bits = sb->s_blocksize_bits + 1; for (i = bits; i >= 0; i--) { if (grp->bb_counters[i] > 0) { grp->bb_largest_free_order = i; break; } } } static noinline_for_stack void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; ext4_grpblk_t len; unsigned free = 0; unsigned fragments = 0; unsigned long long period = get_cycles(); /* initialize buddy from bitmap which is aggregation * of on-disk bitmap and preallocations */ i = mb_find_next_zero_bit(bitmap, max, 0); grp->bb_first_free = i; while (i < max) { fragments++; first = i; i = mb_find_next_bit(bitmap, max, i); len = i - first; free += len; if (len > 1) ext4_mb_mark_free_simple(sb, buddy, first, len, grp); else grp->bb_counters[0]++; if (i < max) i = mb_find_next_zero_bit(bitmap, max, i); } grp->bb_fragments = fragments; if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, "block bitmap and bg descriptor " "inconsistent: %u vs %u free clusters", free, grp->bb_free); /* * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_set_largest_free_order(sb, grp); clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); period = get_cycles() - period; spin_lock(&sbi->s_bal_lock); sbi->s_mb_buddies_generated++; sbi->s_mb_generation_time += period; spin_unlock(&sbi->s_bal_lock); } static void mb_regenerate_buddy(struct ext4_buddy *e4b) { int count; int order = 1; void *buddy; while ((buddy = mb_find_buddy(e4b, order++, &count))) { ext4_set_bits(buddy, 0, count); } e4b->bd_info->bb_fragments = 0; memset(e4b->bd_info->bb_counters, 0, 
sizeof(*e4b->bd_info->bb_counters) * (e4b->bd_sb->s_blocksize_bits + 2)); ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, e4b->bd_bitmap, e4b->bd_group); } /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are * stored in the inode as * * { page } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. * So for each group we take up 2 blocks. A page can * contain blocks_per_page (PAGE_SIZE / blocksize) blocks. * So it can have information regarding groups_per_page which * is blocks_per_page/2 * * Locking note: This routine takes the block group lock of all groups * for this page; do not hold this lock when calling this routine! */ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) { ext4_group_t ngroups; int blocksize; int blocks_per_page; int groups_per_page; int err = 0; int i; ext4_group_t first_group, group; int first_block; struct super_block *sb; struct buffer_head *bhs; struct buffer_head **bh = NULL; struct inode *inode; char *data; char *bitmap; struct ext4_group_info *grinfo; mb_debug(1, "init page %lu\n", page->index); inode = page->mapping->host; sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = i_blocksize(inode); blocks_per_page = PAGE_SIZE / blocksize; groups_per_page = blocks_per_page >> 1; if (groups_per_page == 0) groups_per_page = 1; /* allocate buffer_heads to read bitmaps */ if (groups_per_page > 1) { i = sizeof(struct buffer_head *) * groups_per_page; bh = kzalloc(i, gfp); if (bh == NULL) { err = -ENOMEM; goto out; } } else bh = &bhs; first_group = page->index * blocks_per_page / 2; /* read all groups the page covers into the cache */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { if (group >= ngroups) break; grinfo = ext4_get_group_info(sb, group); /* * If page is uptodate then we came here after online resize * which added some new uninitialized group info structs, so * we must skip all initialized uptodate buddies on the page, * which may be currently in use by an allocating task. 
*/ if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { bh[i] = NULL; continue; } bh[i] = ext4_read_block_bitmap_nowait(sb, group); if (IS_ERR(bh[i])) { err = PTR_ERR(bh[i]); bh[i] = NULL; goto out; } mb_debug(1, "read bitmap for group %u\n", group); } /* wait for I/O completion */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { int err2; if (!bh[i]) continue; err2 = ext4_wait_block_bitmap(sb, group, bh[i]); if (!err) err = err2; } first_block = page->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { group = (first_block + i) >> 1; if (group >= ngroups) break; if (!bh[group - first_group]) /* skip initialized uptodate buddy */ continue; if (!buffer_verified(bh[group - first_group])) /* Skip faulty bitmaps */ continue; err = 0; /* * data carry information regarding this * particular group in the format specified * above * */ data = page_address(page) + (i * blocksize); bitmap = bh[group - first_group]->b_data; /* * We place the buddy block and bitmap block * close together */ if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); mb_debug(1, "put buddy for group %u in page %lu/%x\n", group, page->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo = ext4_get_group_info(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, sizeof(*grinfo->bb_counters) * (sb->s_blocksize_bits+2)); /* * incore got set to the group block bitmap below */ ext4_lock_group(sb, group); /* init the buddy */ memset(data, 0xff, blocksize); ext4_mb_generate_buddy(sb, data, incore, group); ext4_unlock_group(sb, group); incore = NULL; } else { /* this is block of bitmap */ BUG_ON(incore != NULL); mb_debug(1, "put bitmap for group %u in page %lu/%x\n", group, page->index, i * blocksize); trace_ext4_mb_bitmap_load(sb, group); /* see comments in ext4_mb_put_pa() */ ext4_lock_group(sb, group); memcpy(data, bitmap, blocksize); /* mark all preallocated blks used in in-core bitmap */ ext4_mb_generate_from_pa(sb, data, group); ext4_mb_generate_from_freelist(sb, data, group); ext4_unlock_group(sb, group); /* set incore so that the buddy information can be * generated using this */ incore = data; } } SetPageUptodate(page); out: if (bh) { for (i = 0; i < groups_per_page; i++) brelse(bh[i]); if (bh != &bhs) kfree(bh); } return err; } /* * Lock the buddy and bitmap pages. This make sure other parallel init_group * on the same buddy page doesn't happen whild holding the buddy page lock. * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap * are on the same page e4b->bd_buddy_page is NULL and return value is 0. */ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { struct inode *inode = EXT4_SB(sb)->s_buddy_cache; int block, pnum, poff; int blocks_per_page; struct page *page; e4b->bd_buddy_page = NULL; e4b->bd_bitmap_page = NULL; blocks_per_page = PAGE_SIZE / sb->s_blocksize; /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks. 
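 *
 * Rough worked example (numbers are only illustrative): with 4 KiB
 * pages and 1 KiB blocks, blocks_per_page == 4, so group 5 maps to
 * block 10, i.e. pnum == 2 and poff == 2, and its buddy (block 11)
 * lands on the same page, so only the bitmap page is locked.  With
 * 4 KiB blocks (blocks_per_page == 1) the bitmap of group 5 sits on
 * page 10 and the buddy on page 11, and both pages are locked below.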
*/ block = group * 2; pnum = block / blocks_per_page; poff = block % blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, gfp); if (!page) return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); if (blocks_per_page >= 2) { /* buddy and bitmap are on the same page */ return 0; } block++; pnum = block / blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, gfp); if (!page) return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_buddy_page = page; return 0; } static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_page) { unlock_page(e4b->bd_bitmap_page); put_page(e4b->bd_bitmap_page); } if (e4b->bd_buddy_page) { unlock_page(e4b->bd_buddy_page); put_page(e4b->bd_buddy_page); } } /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this page; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) { struct ext4_group_info *this_grp; struct ext4_buddy e4b; struct page *page; int ret = 0; might_sleep(); mb_debug(1, "init group %u\n", group); this_grp = ext4_get_group_info(sb, group); /* * This ensures that we don't reinit the buddy cache * page which map to the group from which we are already * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that * would have pinned buddy page to page cache. * The call to ext4_mb_get_buddy_page_lock will mark the * page accessed. */ ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group * return without doing anything */ goto err; } page = e4b.bd_bitmap_page; ret = ext4_mb_init_cache(page, NULL, gfp); if (ret) goto err; if (!PageUptodate(page)) { ret = -EIO; goto err; } if (e4b.bd_buddy_page == NULL) { /* * If both the bitmap and buddy are in * the same page we don't need to force * init the buddy */ ret = 0; goto err; } /* init buddy cache */ page = e4b.bd_buddy_page; ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp); if (ret) goto err; if (!PageUptodate(page)) { ret = -EIO; goto err; } err: ext4_mb_put_buddy_page_lock(&e4b); return ret; } /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this page; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { int blocks_per_page; int block; int pnum; int poff; struct page *page; int ret; struct ext4_group_info *grp; struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; might_sleep(); mb_debug(1, "load group %u\n", group); blocks_per_page = PAGE_SIZE / sb->s_blocksize; grp = ext4_get_group_info(sb, group); e4b->bd_blkbits = sb->s_blocksize_bits; e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; e4b->bd_buddy_page = NULL; e4b->bd_bitmap_page = NULL; if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { /* * we need full data about the group * to make a good selection */ ret = ext4_mb_init_group(sb, group, gfp); if (ret) return ret; } /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks. 
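 *
 * Sketch of the lookup below, assuming nothing is cached yet: for
 * group G we first try find_get_page_flags() on the bitmap page; if
 * the page is missing or not uptodate we fall back to
 * find_or_create_page() and let ext4_mb_init_cache() fill it, and
 * then repeat the same steps for the buddy page (block 2*G + 1).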
*/ block = group * 2; pnum = block / blocks_per_page; poff = block % blocks_per_page; /* we could use find_or_create_page(), but it locks page * what we'd like to avoid in fast path ... */ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); if (page == NULL || !PageUptodate(page)) { if (page) /* * drop the page reference and try * to get the page with lock. If we * are not uptodate that implies * somebody just created the page but * is yet to initialize the same. So * wait for it to initialize. */ put_page(page); page = find_or_create_page(inode->i_mapping, pnum, gfp); if (page) { BUG_ON(page->mapping != inode->i_mapping); if (!PageUptodate(page)) { ret = ext4_mb_init_cache(page, NULL, gfp); if (ret) { unlock_page(page); goto err; } mb_cmp_bitmaps(e4b, page_address(page) + (poff * sb->s_blocksize)); } unlock_page(page); } } if (page == NULL) { ret = -ENOMEM; goto err; } if (!PageUptodate(page)) { ret = -EIO; goto err; } /* Pages marked accessed already */ e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); block++; pnum = block / blocks_per_page; poff = block % blocks_per_page; page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); if (page == NULL || !PageUptodate(page)) { if (page) put_page(page); page = find_or_create_page(inode->i_mapping, pnum, gfp); if (page) { BUG_ON(page->mapping != inode->i_mapping); if (!PageUptodate(page)) { ret = ext4_mb_init_cache(page, e4b->bd_bitmap, gfp); if (ret) { unlock_page(page); goto err; } } unlock_page(page); } } if (page == NULL) { ret = -ENOMEM; goto err; } if (!PageUptodate(page)) { ret = -EIO; goto err; } /* Pages marked accessed already */ e4b->bd_buddy_page = page; e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); return 0; err: if (page) put_page(page); if (e4b->bd_bitmap_page) put_page(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) put_page(e4b->bd_buddy_page); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; return ret; } static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b) { return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS); } static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_page) put_page(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) put_page(e4b->bd_buddy_page); } static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) { int order = 1; int bb_incr = 1 << (e4b->bd_blkbits - 1); void *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); bb = e4b->bd_buddy; while (order <= e4b->bd_blkbits + 1) { block = block >> 1; if (!mb_test_bit(block, bb)) { /* this block is part of buddy of order 'order' */ return order; } bb += bb_incr; bb_incr >>= 1; order++; } return 0; } static void mb_clear_bits(void *bm, int cur, int len) { __u32 *addr; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: clear whole word at once */ addr = bm + (cur >> 3); *addr = 0; cur += 32; continue; } mb_clear_bit(cur, bm); cur++; } } /* clear bits in given range * will return first found zero bit if any, -1 otherwise */ static int mb_test_and_clear_bits(void *bm, int cur, int len) { __u32 *addr; int zero_bit = -1; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: clear whole word at once */ addr = bm + (cur >> 3); if (*addr != (__u32)(-1) && zero_bit == -1) zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); *addr 
= 0; cur += 32; continue; } if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) zero_bit = cur; cur++; } return zero_bit; } void ext4_set_bits(void *bm, int cur, int len) { __u32 *addr; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: set whole word at once */ addr = bm + (cur >> 3); *addr = 0xffffffff; cur += 32; continue; } mb_set_bit(cur, bm); cur++; } } /* * _________________________________________________________________ */ static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) { if (mb_test_bit(*bit + side, bitmap)) { mb_clear_bit(*bit, bitmap); (*bit) -= side; return 1; } else { (*bit) += side; mb_set_bit(*bit, bitmap); return -1; } } static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) { int max; int order = 1; void *buddy = mb_find_buddy(e4b, order, &max); while (buddy) { void *buddy2; /* Bits in range [first; last] are known to be set since * corresponding blocks were allocated. Bits in range * (first; last) will stay set because they form buddies on * upper layer. We just deal with borders if they don't * align with upper layer and then go up. * Releasing entire group is all about clearing * single bit of highest order buddy. */ /* Example: * --------------------------------- * | 1 | 1 | 1 | 1 | * --------------------------------- * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | * --------------------------------- * 0 1 2 3 4 5 6 7 * \_____________________/ * * Neither [1] nor [6] is aligned to above layer. * Left neighbour [0] is free, so mark it busy, * decrease bb_counters and extend range to * [0; 6] * Right neighbour [7] is busy. It can't be coaleasced with [6], so * mark [6] free, increase bb_counters and shrink range to * [0; 5]. * Then shift range to [0; 2], go up and do the same. */ if (first & 1) e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); if (!(last & 1)) e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); if (first > last) break; order++; if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) { mb_clear_bits(buddy, first, last - first + 1); e4b->bd_info->bb_counters[order - 1] += last - first + 1; break; } first >>= 1; last >>= 1; buddy = buddy2; } } static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int left_is_free = 0; int right_is_free = 0; int block; int last = first + count - 1; struct super_block *sb = e4b->bd_sb; if (WARN_ON(count == 0)) return; BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); /* Don't bother if the block group is corrupt. */ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) return; mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); e4b->bd_info->bb_free += count; if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; /* access memory sequentially: check left neighbour, * clear range and then check right neighbour */ if (first != 0) left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); if (unlikely(block != -1)) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(sbi, block); ext4_grp_locked_error(sb, e4b->bd_group, inode ? 
inode->i_ino : 0, blocknr, "freeing already freed block " "(bit %u); block bitmap corrupt.", block); ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); mb_regenerate_buddy(e4b); goto done; } /* let's maintain fragments counter */ if (left_is_free && right_is_free) e4b->bd_info->bb_fragments--; else if (!left_is_free && !right_is_free) e4b->bd_info->bb_fragments++; /* buddy[0] == bd_bitmap is a special case, so handle * it right away and let mb_buddy_mark_free stay free of * zero order checks. * Check if neighbours are to be coaleasced, * adjust bitmap bb_counters and borders appropriately. */ if (first & 1) { first += !left_is_free; e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; } if (!(last & 1)) { last -= !right_is_free; e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; } if (first <= last) mb_buddy_mark_free(e4b, first >> 1, last >> 1); done: mb_set_largest_free_order(sb, e4b->bd_info); mb_check_buddy(e4b); } static int mb_find_extent(struct ext4_buddy *e4b, int block, int needed, struct ext4_free_extent *ex) { int next = block; int max, order; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); buddy = mb_find_buddy(e4b, 0, &max); BUG_ON(buddy == NULL); BUG_ON(block >= max); if (mb_test_bit(block, buddy)) { ex->fe_len = 0; ex->fe_start = 0; ex->fe_group = 0; return 0; } /* find actual order */ order = mb_find_order_for_block(e4b, block); block = block >> order; ex->fe_len = 1 << order; ex->fe_start = block << order; ex->fe_group = e4b->bd_group; /* calc difference from given start */ next = next - ex->fe_start; ex->fe_len -= next; ex->fe_start += next; while (needed > ex->fe_len && mb_find_buddy(e4b, order, &max)) { if (block + 1 >= max) break; next = (block + 1) * (1 << order); if (mb_test_bit(next, e4b->bd_bitmap)) break; order = mb_find_order_for_block(e4b, next); block = next >> order; ex->fe_len += 1 << order; } if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) { /* Should never happen! (but apparently sometimes does?!?) */ WARN_ON(1); ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0, "corruption or bug in mb_find_extent " "block=%d, order=%d needed=%d ex=%u/%d/%d@%u", block, order, needed, ex->fe_group, ex->fe_start, ex->fe_len, ex->fe_logical); ex->fe_len = 0; ex->fe_start = 0; ex->fe_group = 0; } return ex->fe_len; } static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) { int ord; int mlen = 0; int max = 0; int cur; int start = ex->fe_start; int len = ex->fe_len; unsigned ret = 0; int len0 = len; void *buddy; BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); mb_check_buddy(e4b); mb_mark_used_double(e4b, start, len); e4b->bd_info->bb_free -= len; if (e4b->bd_info->bb_first_free == start) e4b->bd_info->bb_first_free += len; /* let's maintain fragments counter */ if (start != 0) mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) max = !mb_test_bit(start + len, e4b->bd_bitmap); if (mlen && max) e4b->bd_info->bb_fragments++; else if (!mlen && !max) e4b->bd_info->bb_fragments--; /* let's maintain buddy itself */ while (len) { ord = mb_find_order_for_block(e4b, start); if (((start >> ord) << ord) == start && len >= (1 << ord)) { /* the whole chunk may be allocated at once! 
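 *
 * E.g. (illustrative numbers): if start == 24, len == 8 and block 24
 * heads a free order-3 buddy, we only set bit 24 >> 3 == 3 in the
 * order-3 buddy bitmap, decrement bb_counters[3] and advance start
 * by 8 clusters, with no splitting at all.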
*/ mlen = 1 << ord; buddy = mb_find_buddy(e4b, ord, &max); BUG_ON((start >> ord) >= max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; start += mlen; len -= mlen; BUG_ON(len < 0); continue; } /* store for history */ if (ret == 0) ret = len | (ord << 16); /* we have to split large buddy */ BUG_ON(ord <= 0); buddy = mb_find_buddy(e4b, ord, &max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; ord--; cur = (start >> ord) & ~1U; buddy = mb_find_buddy(e4b, ord, &max); mb_clear_bit(cur, buddy); mb_clear_bit(cur + 1, buddy); e4b->bd_info->bb_counters[ord]++; e4b->bd_info->bb_counters[ord]++; } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); return ret; } /* * Must be called under group lock! */ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int ret; BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); BUG_ON(ac->ac_status == AC_STATUS_FOUND); ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; ret = mb_mark_used(e4b, &ac->ac_b_ex); /* preallocation can change ac_b_ex, thus we store actually * allocated blocks for history */ ac->ac_f_ex = ac->ac_b_ex; ac->ac_status = AC_STATUS_FOUND; ac->ac_tail = ret & 0xffff; ac->ac_buddy = ret >> 16; /* * take the page reference. We want the page to be pinned * so that we don't get a ext4_mb_init_cache_call for this * group until we update the bitmap. That would mean we * double allocate blocks. The reference is dropped * in ext4_mb_release_context */ ac->ac_bitmap_page = e4b->bd_bitmap_page; get_page(ac->ac_bitmap_page); ac->ac_buddy_page = e4b->bd_buddy_page; get_page(ac->ac_buddy_page); /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { spin_lock(&sbi->s_md_lock); sbi->s_mb_last_group = ac->ac_f_ex.fe_group; sbi->s_mb_last_start = ac->ac_f_ex.fe_start; spin_unlock(&sbi->s_md_lock); } } /* * regular allocator, for general purposes allocation */ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, struct ext4_buddy *e4b, int finish_group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_free_extent *bex = &ac->ac_b_ex; struct ext4_free_extent *gex = &ac->ac_g_ex; struct ext4_free_extent ex; int max; if (ac->ac_status == AC_STATUS_FOUND) return; /* * We don't want to scan for a whole year */ if (ac->ac_found > sbi->s_mb_max_to_scan && !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { ac->ac_status = AC_STATUS_BREAK; return; } /* * Haven't found good chunk so far, let's continue */ if (bex->fe_len < gex->fe_len) return; if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) && bex->fe_group == e4b->bd_group) { /* recheck chunk's availability - we don't know * when it was found (within this lock-unlock * period or not) */ max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex); if (max >= gex->fe_len) { ext4_mb_use_best_found(ac, e4b); return; } } } /* * The routine checks whether found extent is good enough. If it is, * then the extent gets marked used and flag is set to the context * to stop scanning. Otherwise, the extent is compared with the * previous found extent and if new one is better, then it's stored * in the context. Later, the best found extent will be used, if * mballoc can't find good enough extent. * * FIXME: real allocation policy is to be designed yet! 
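 *
 * A small illustration of this policy (made-up numbers, goal of 8
 * clusters): an extent of exactly 8 is used on the spot; if the best
 * so far is 6, a 12-cluster extent replaces it because the goal is
 * not yet satisfied; if the best so far is 16, a 10-cluster extent
 * replaces it instead, since once the goal is met the smaller of the
 * satisfying extents is preferred.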
*/ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *ex, struct ext4_buddy *e4b) { struct ext4_free_extent *bex = &ac->ac_b_ex; struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); ac->ac_found++; /* * The special case - take what you catch first */ if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { *bex = *ex; ext4_mb_use_best_found(ac, e4b); return; } /* * Let's check whether the chuck is good enough */ if (ex->fe_len == gex->fe_len) { *bex = *ex; ext4_mb_use_best_found(ac, e4b); return; } /* * If this is first found extent, just store it in the context */ if (bex->fe_len == 0) { *bex = *ex; return; } /* * If new found extent is better, store it in the context */ if (bex->fe_len < gex->fe_len) { /* if the request isn't satisfied, any found extent * larger than previous best one is better */ if (ex->fe_len > bex->fe_len) *bex = *ex; } else if (ex->fe_len > gex->fe_len) { /* if the request is satisfied, then we try to find * an extent that still satisfy the request, but is * smaller than previous one */ if (ex->fe_len < bex->fe_len) *bex = *ex; } ext4_mb_check_limits(ac, e4b, 0); } static noinline_for_stack int ext4_mb_try_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_free_extent ex = ac->ac_b_ex; ext4_group_t group = ex.fe_group; int max; int err; BUG_ON(ex.fe_len <= 0); err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return err; ext4_lock_group(ac->ac_sb, group); max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); return 0; } static noinline_for_stack int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { ext4_group_t group = ac->ac_g_ex.fe_group; int max; int err; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct ext4_free_extent ex; if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) return 0; if (grp->bb_free == 0) return 0; err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return err; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) { ext4_mb_unload_buddy(e4b); return 0; } ext4_lock_group(ac->ac_sb, group); max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ext4_fsblk_t start; start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + ex.fe_start; /* use do_div to get remainder (would be 64-bit modulo) */ if (do_div(start, sbi->s_stripe) == 0) { ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } } else if (max >= ac->ac_g_ex.fe_len) { BUG_ON(ex.fe_len <= 0); BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { /* Sometimes, caller may want to merge even small * number of blocks to an existing extent */ BUG_ON(ex.fe_len <= 0); BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); 
return 0; } /* * The routine scans buddy structures (not bitmap!) from given order * to max order and tries to find big enough chunk to satisfy the req */ static noinline_for_stack void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; struct ext4_group_info *grp = e4b->bd_info; void *buddy; int i; int k; int max; BUG_ON(ac->ac_2order <= 0); for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { if (grp->bb_counters[i] == 0) continue; buddy = mb_find_buddy(e4b, i, &max); BUG_ON(buddy == NULL); k = mb_find_next_zero_bit(buddy, max, 0); if (k >= max) { ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, "%d free clusters of order %d. But found 0", grp->bb_counters[i], i); ext4_mark_group_bitmap_corrupted(ac->ac_sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); break; } ac->ac_found++; ac->ac_b_ex.fe_len = 1 << i; ac->ac_b_ex.fe_start = k << i; ac->ac_b_ex.fe_group = e4b->bd_group; ext4_mb_use_best_found(ac, e4b); BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); if (EXT4_SB(sb)->s_mb_stats) atomic_inc(&EXT4_SB(sb)->s_bal_2orders); break; } } /* * The routine scans the group and measures all found extents. * In order to optimize scanning, caller must pass number of * free blocks in the group, so the routine can know upper limit. */ static noinline_for_stack void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; int i; int free; free = e4b->bd_info->bb_free; if (WARN_ON(free <= 0)) return; i = e4b->bd_info->bb_first_free; while (free && ac->ac_status == AC_STATUS_CONTINUE) { i = mb_find_next_zero_bit(bitmap, EXT4_CLUSTERS_PER_GROUP(sb), i); if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { /* * IF we have corrupt bitmap, we won't find any * free blocks even though group info says we * we have free blocks */ ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But bitmap says 0", free); ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); break; } mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); if (WARN_ON(ex.fe_len <= 0)) break; if (free < ex.fe_len) { ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); /* * The number of free blocks differs. This mostly * indicate that the bitmap is corrupt. So exit * without claiming the space. 
*/ break; } ex.fe_logical = 0xDEADC0DE; /* debug value */ ext4_mb_measure_extent(ac, &ex, e4b); i += ex.fe_len; free -= ex.fe_len; } ext4_mb_check_limits(ac, e4b, 1); } /* * This is a special case for storages like raid5 * we try to find stripe-aligned chunks for stripe-size-multiple requests */ static noinline_for_stack void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; ext4_fsblk_t first_group_block; ext4_fsblk_t a; ext4_grpblk_t i; int max; BUG_ON(sbi->s_stripe == 0); /* find first stripe-aligned block in group */ first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); a = first_group_block + sbi->s_stripe - 1; do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); if (max >= sbi->s_stripe) { ac->ac_found++; ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); break; } } i += sbi->s_stripe; } } /* * This is now called BEFORE we load the buddy bitmap. * Returns either 1 or 0 indicating that the group is either suitable * for the allocation or not. In addition it can also return negative * error code when something goes wrong. */ static int ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, int cr) { unsigned free, fragments; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); BUG_ON(cr < 0 || cr >= 4); free = grp->bb_free; if (free == 0) return 0; if (cr <= 2 && free < ac->ac_g_ex.fe_len) return 0; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return 0; /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS); if (ret) return ret; } fragments = grp->bb_fragments; if (fragments == 0) return 0; switch (cr) { case 0: BUG_ON(ac->ac_2order == 0); /* Avoid using the first bg of a flexgroup for data files */ if ((ac->ac_flags & EXT4_MB_HINT_DATA) && (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && ((group % flex_size) == 0)) return 0; if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) || (free / fragments) >= ac->ac_g_ex.fe_len) return 1; if (grp->bb_largest_free_order < ac->ac_2order) return 0; return 1; case 1: if ((free / fragments) >= ac->ac_g_ex.fe_len) return 1; break; case 2: if (free >= ac->ac_g_ex.fe_len) return 1; break; case 3: return 1; default: BUG(); } return 0; } static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t ngroups, group, i; int cr; int err = 0, first_err = 0; struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; sb = ac->ac_sb; sbi = EXT4_SB(sb); ngroups = ext4_get_groups_count(sb); /* non-extent files are limited to low blocks/groups */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) ngroups = sbi->s_blockfile_groups; BUG_ON(ac->ac_status == AC_STATUS_FOUND); /* first, try the goal */ err = ext4_mb_find_by_goal(ac, &e4b); if (err || ac->ac_status == AC_STATUS_FOUND) goto out; if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) goto out; /* * ac->ac2_order is set only if the fe_len is a power of 2 * if ac2_order is set we also set criteria to 0 so that we * try exact allocation using buddy. 
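 *
 * Two illustrative cases (assuming the default s_mb_order2_reqs
 * threshold): a goal of 64 clusters gives i == fls(64) == 7 and
 * 64 & ~(1 << 6) == 0, so ac_2order becomes 6 and scanning starts at
 * cr == 0 (exact buddy allocation); a goal of 24 clusters gives
 * i == 5 but 24 & ~(1 << 4) != 0, so ac_2order stays 0 and scanning
 * starts at cr == 1.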
*/ i = fls(ac->ac_g_ex.fe_len); ac->ac_2order = 0; /* * We search using buddy data only if the order of the request * is greater than equal to the sbi_s_mb_order2_reqs * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req * We also support searching for power-of-two requests only for * requests upto maximum buddy size we have constructed. */ if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) { /* * This should tell if fe_len is exactly power of 2 */ if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) ac->ac_2order = array_index_nospec(i - 1, sb->s_blocksize_bits + 2); } /* if stream allocation is enabled, use global goal */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { /* TBD: may be hot point */ spin_lock(&sbi->s_md_lock); ac->ac_g_ex.fe_group = sbi->s_mb_last_group; ac->ac_g_ex.fe_start = sbi->s_mb_last_start; spin_unlock(&sbi->s_md_lock); } /* Let's just scan groups to find more-less suitable blocks */ cr = ac->ac_2order ? 0 : 1; /* * cr == 0 try to get exact allocation, * cr == 3 try to get anything */ repeat: for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { ac->ac_criteria = cr; /* * searching for the right group start * from the goal value specified */ group = ac->ac_g_ex.fe_group; for (i = 0; i < ngroups; group++, i++) { int ret = 0; cond_resched(); /* * Artificially restricted ngroups for non-extent * files makes group > ngroups possible on first loop. */ if (group >= ngroups) group = 0; /* This now checks without needing the buddy page */ ret = ext4_mb_good_group(ac, group, cr); if (ret <= 0) { if (!first_err) first_err = ret; continue; } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) goto out; ext4_lock_group(sb, group); /* * We need to check again after locking the * block group */ ret = ext4_mb_good_group(ac, group, cr); if (ret <= 0) { ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); if (!first_err) first_err = ret; continue; } ac->ac_groups_scanned++; if (cr == 0) ext4_mb_simple_scan_group(ac, &e4b); else if (cr == 1 && sbi->s_stripe && !(ac->ac_g_ex.fe_len % sbi->s_stripe)) ext4_mb_scan_aligned(ac, &e4b); else ext4_mb_complex_scan_group(ac, &e4b); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); if (ac->ac_status != AC_STATUS_CONTINUE) break; } } if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { /* * We've been searching too long. Let's try to allocate * the best chunk we've found so far */ ext4_mb_try_best_found(ac, &e4b); if (ac->ac_status != AC_STATUS_FOUND) { /* * Someone more lucky has already allocated it. 
* The only thing we can do is just take first * found block(s) printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); */ ac->ac_b_ex.fe_group = 0; ac->ac_b_ex.fe_start = 0; ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; ac->ac_flags |= EXT4_MB_HINT_FIRST; cr = 3; atomic_inc(&sbi->s_mb_lost_chunks); goto repeat; } } out: if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) err = first_err; return err; } static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = PDE_DATA(file_inode(seq->file)); ext4_group_t group; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; return (void *) ((unsigned long) group); } static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { struct super_block *sb = PDE_DATA(file_inode(seq->file)); ext4_group_t group; ++*pos; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; return (void *) ((unsigned long) group); } static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { struct super_block *sb = PDE_DATA(file_inode(seq->file)); ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i; int err, buddy_loaded = 0; struct ext4_buddy e4b; struct ext4_group_info *grinfo; unsigned char blocksize_bits = min_t(unsigned char, sb->s_blocksize_bits, EXT4_MAX_BLOCK_LOG_SIZE); struct sg { struct ext4_group_info info; ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; } sg; group--; if (group == 0) seq_puts(seq, "#group: free frags first [" " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + sizeof(struct ext4_group_info); grinfo = ext4_get_group_info(sb, group); /* Load the group info in memory only if not already loaded. */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { seq_printf(seq, "#%-5u: I/O error\n", group); return 0; } buddy_loaded = 1; } memcpy(&sg, ext4_get_group_info(sb, group), i); if (buddy_loaded) ext4_mb_unload_buddy(&e4b); seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, sg.info.bb_fragments, sg.info.bb_first_free); for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? 
sg.info.bb_counters[i] : 0); seq_printf(seq, " ]\n"); return 0; } static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) { } const struct seq_operations ext4_mb_seq_groups_ops = { .start = ext4_mb_seq_groups_start, .next = ext4_mb_seq_groups_next, .stop = ext4_mb_seq_groups_stop, .show = ext4_mb_seq_groups_show, }; static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) { int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; BUG_ON(!cachep); return cachep; } /* * Allocate the top-level s_group_info array for the specified number * of groups */ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned size; struct ext4_group_info ***old_groupinfo, ***new_groupinfo; size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); if (size <= sbi->s_group_info_size) return 0; size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); new_groupinfo = kvzalloc(size, GFP_KERNEL); if (!new_groupinfo) { ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); return -ENOMEM; } rcu_read_lock(); old_groupinfo = rcu_dereference(sbi->s_group_info); if (old_groupinfo) memcpy(new_groupinfo, old_groupinfo, sbi->s_group_info_size * sizeof(*sbi->s_group_info)); rcu_read_unlock(); rcu_assign_pointer(sbi->s_group_info, new_groupinfo); sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); if (old_groupinfo) ext4_kvfree_array_rcu(old_groupinfo); ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", sbi->s_group_info_size); return 0; } /* Create and initialize ext4_group_info data for the given group. */ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *desc) { int i; int metalen = 0; int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info **meta_group_info; struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); /* * First check if this group is the first of a reserved block. 
* If it's true, we have to allocate a new table of pointers * to ext4_group_info structures */ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); meta_group_info = kmalloc(metalen, GFP_NOFS); if (meta_group_info == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate mem " "for a buddy group"); goto exit_meta_group_info; } rcu_read_lock(); rcu_dereference(sbi->s_group_info)[idx] = meta_group_info; rcu_read_unlock(); } meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx); i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS); if (meta_group_info[i] == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); goto exit_group_info; } set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(meta_group_info[i]->bb_state)); /* * initialize bb_free to be able to skip * empty groups without initialization */ if (ext4_has_group_desc_csum(sb) && (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { meta_group_info[i]->bb_free = ext4_free_clusters_after_init(sb, group, desc); } else { meta_group_info[i]->bb_free = ext4_free_group_clusters(sb, desc); } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ #ifdef DOUBLE_CHECK { struct buffer_head *bh; meta_group_info[i]->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); BUG_ON(meta_group_info[i]->bb_bitmap == NULL); bh = ext4_read_block_bitmap(sb, group); BUG_ON(IS_ERR_OR_NULL(bh)); memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, sb->s_blocksize); put_bh(bh); } #endif return 0; exit_group_info: /* If a meta_group_info table has been allocated, release it now */ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { struct ext4_group_info ***group_info; rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); kfree(group_info[idx]); group_info[idx] = NULL; rcu_read_unlock(); } exit_meta_group_info: return -ENOMEM; } /* ext4_mb_add_groupinfo */ static int ext4_mb_init_backend(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; struct ext4_sb_info *sbi = EXT4_SB(sb); int err; struct ext4_group_desc *desc; struct ext4_group_info ***group_info; struct kmem_cache *cachep; err = ext4_mb_alloc_groupinfo(sb, ngroups); if (err) return err; sbi->s_buddy_cache = new_inode(sb); if (sbi->s_buddy_cache == NULL) { ext4_msg(sb, KERN_ERR, "can't get new inode"); goto err_freesgi; } /* To avoid potentially colliding with an valid on-disk inode number, * use EXT4_BAD_INO for the buddy cache inode number. This inode is * not in the inode hash, so it should never be found by iget(), but * this will avoid confusion if it ever shows up during debugging. 
*/ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); goto err_freebuddy; } if (ext4_mb_add_groupinfo(sb, i, desc) != 0) goto err_freebuddy; } return 0; err_freebuddy: cachep = get_groupinfo_cache(sb->s_blocksize_bits); while (i-- > 0) kmem_cache_free(cachep, ext4_get_group_info(sb, i)); i = sbi->s_group_info_size; rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); while (i-- > 0) kfree(group_info[i]); rcu_read_unlock(); iput(sbi->s_buddy_cache); err_freesgi: rcu_read_lock(); kvfree(rcu_dereference(sbi->s_group_info)); rcu_read_unlock(); return -ENOMEM; } static void ext4_groupinfo_destroy_slabs(void) { int i; for (i = 0; i < NR_GRPINFO_CACHES; i++) { kmem_cache_destroy(ext4_groupinfo_caches[i]); ext4_groupinfo_caches[i] = NULL; } } static int ext4_groupinfo_create_slab(size_t size) { static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); int slab_size; int blocksize_bits = order_base_2(size); int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; struct kmem_cache *cachep; if (cache_index >= NR_GRPINFO_CACHES) return -EINVAL; if (unlikely(cache_index < 0)) cache_index = 0; mutex_lock(&ext4_grpinfo_slab_create_mutex); if (ext4_groupinfo_caches[cache_index]) { mutex_unlock(&ext4_grpinfo_slab_create_mutex); return 0; /* Already created */ } slab_size = offsetof(struct ext4_group_info, bb_counters[blocksize_bits + 2]); cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], slab_size, 0, SLAB_RECLAIM_ACCOUNT, NULL); ext4_groupinfo_caches[cache_index] = cachep; mutex_unlock(&ext4_grpinfo_slab_create_mutex); if (!cachep) { printk(KERN_EMERG "EXT4-fs: no memory for groupinfo slab cache\n"); return -ENOMEM; } return 0; } int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned i, j; unsigned offset, offset_incr; unsigned max; int ret; i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { ret = -ENOMEM; goto out; } i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { ret = -ENOMEM; goto out; } ret = ext4_groupinfo_create_slab(sb->s_blocksize); if (ret < 0) goto out; /* order 0 is regular bitmap */ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; sbi->s_mb_offsets[0] = 0; i = 1; offset = 0; offset_incr = 1 << (sb->s_blocksize_bits - 1); max = sb->s_blocksize << 2; do { sbi->s_mb_offsets[i] = offset; sbi->s_mb_maxs[i] = max; offset += offset_incr; offset_incr = offset_incr >> 1; max = max >> 1; i++; } while (i <= sb->s_blocksize_bits + 1); spin_lock_init(&sbi->s_md_lock); spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_free_pending = 0; INIT_LIST_HEAD(&sbi->s_freed_data_list); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; /* * The default group preallocation is 512, which for 4k block * sizes translates to 2 megabytes. However for bigalloc file * systems, this is probably too big (i.e, if the cluster size * is 1 megabyte, then group preallocation size becomes half a * gigabyte!). 
As a default, we will keep a two megabyte * group pralloc size for cluster sizes up to 64k, and after * that, we will force a minimum group preallocation size of * 32 clusters. This translates to 8 megs when the cluster * size is 256k, and 32 megs when the cluster size is 1 meg, * which seems reasonable as a default. */ sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> sbi->s_cluster_bits, 32); /* * If there is a s_stripe > 1, then we set the s_mb_group_prealloc * to the lowest multiple of s_stripe which is bigger than * the s_mb_group_prealloc as determined above. We want * the preallocation size to be an exact multiple of the * RAID stripe size so that preallocations don't fragment * the stripes. */ if (sbi->s_stripe > 1) { sbi->s_mb_group_prealloc = roundup( sbi->s_mb_group_prealloc, sbi->s_stripe); } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { ret = -ENOMEM; goto out; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; lg = per_cpu_ptr(sbi->s_locality_groups, i); mutex_init(&lg->lg_mutex); for (j = 0; j < PREALLOC_TB_SIZE; j++) INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); spin_lock_init(&lg->lg_prealloc_lock); } /* init file for buddy data */ ret = ext4_mb_init_backend(sb); if (ret != 0) goto out_free_locality_groups; return 0; out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; out: kfree(sbi->s_mb_offsets); sbi->s_mb_offsets = NULL; kfree(sbi->s_mb_maxs); sbi->s_mb_maxs = NULL; return ret; } /* need to called with the ext4 group lock held */ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) { struct ext4_prealloc_space *pa; struct list_head *cur, *tmp; int count = 0; list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); list_del(&pa->pa_group_list); count++; kmem_cache_free(ext4_pspace_cachep, pa); } if (count) mb_debug(1, "mballoc: %u PAs left\n", count); } int ext4_mb_release(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; int num_meta_group_infos; struct ext4_group_info *grinfo, ***group_info; struct ext4_sb_info *sbi = EXT4_SB(sb); struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { grinfo = ext4_get_group_info(sb, i); #ifdef DOUBLE_CHECK kfree(grinfo->bb_bitmap); #endif ext4_lock_group(sb, i); ext4_mb_cleanup_pa(grinfo); ext4_unlock_group(sb, i); kmem_cache_free(cachep, grinfo); } num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); for (i = 0; i < num_meta_group_infos; i++) kfree(group_info[i]); kvfree(group_info); rcu_read_unlock(); } kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); iput(sbi->s_buddy_cache); if (sbi->s_mb_stats) { ext4_msg(sb, KERN_INFO, "mballoc: %u blocks %u reqs (%u success)", atomic_read(&sbi->s_bal_allocated), atomic_read(&sbi->s_bal_reqs), atomic_read(&sbi->s_bal_success)); ext4_msg(sb, KERN_INFO, "mballoc: %u extents scanned, %u goal hits, " "%u 2^N hits, %u breaks, %u lost", atomic_read(&sbi->s_bal_ex_scanned), atomic_read(&sbi->s_bal_goals), atomic_read(&sbi->s_bal_2orders), atomic_read(&sbi->s_bal_breaks), atomic_read(&sbi->s_mb_lost_chunks)); ext4_msg(sb, KERN_INFO, "mballoc: %lu generated and it took %Lu", sbi->s_mb_buddies_generated, sbi->s_mb_generation_time); ext4_msg(sb, KERN_INFO, "mballoc: %u preallocated, %u discarded", 
atomic_read(&sbi->s_mb_preallocated), atomic_read(&sbi->s_mb_discarded)); } free_percpu(sbi->s_locality_groups); return 0; } static inline int ext4_issue_discard(struct super_block *sb, ext4_group_t block_group, ext4_grpblk_t cluster, int count, struct bio **biop) { ext4_fsblk_t discard_block; discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + ext4_group_first_block_no(sb, block_group)); count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); if (biop) { return __blkdev_issue_discard(sb->s_bdev, (sector_t)discard_block << (sb->s_blocksize_bits - 9), (sector_t)count << (sb->s_blocksize_bits - 9), GFP_NOFS, 0, biop); } else return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } static void ext4_free_data_in_buddy(struct super_block *sb, struct ext4_free_data *entry) { struct ext4_buddy e4b; struct ext4_group_info *db; int err, count = 0, count2 = 0; mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); spin_lock(&EXT4_SB(sb)->s_md_lock); EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count; spin_unlock(&EXT4_SB(sb)->s_md_lock); db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ count += entry->efd_count; count2++; ext4_lock_group(sb, entry->efd_group); /* Take it out of per group rb tree */ rb_erase(&entry->efd_node, &(db->bb_free_root)); mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); /* * Clear the trimmed flag for the group so that the next * ext4_trim_fs can trim it. * If the volume is mounted with -o discard, online discard * is supported and the free blocks will be trimmed online. */ if (!test_opt(sb, DISCARD)) EXT4_MB_GRP_CLEAR_TRIMMED(db); if (!db->bb_free_root.rb_node) { /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() */ put_page(e4b.bd_buddy_page); put_page(e4b.bd_bitmap_page); } ext4_unlock_group(sb, entry->efd_group); kmem_cache_free(ext4_free_data_cachep, entry); ext4_mb_unload_buddy(&e4b); mb_debug(1, "freed %u blocks in %u structures\n", count, count2); } /* * This function is called by the jbd2 layer once the commit has finished, * so we know we can free the blocks that were released with that commit. 
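 *
 * Rough life cycle of a freed extent (a sketch of the code below):
 * ext4_mb_free_metadata() queues an ext4_free_data entry tagged with
 * the freeing transaction's tid on s_freed_data_list; once jbd2
 * commits that tid, this function cuts the matching entries off the
 * list, issues discards for them when the fs is mounted with
 * -o discard, and finally returns the clusters to the buddy via
 * ext4_free_data_in_buddy().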
*/ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_free_data *entry, *tmp; struct bio *discard_bio = NULL; struct list_head freed_data_list; struct list_head *cut_pos = NULL; int err; INIT_LIST_HEAD(&freed_data_list); spin_lock(&sbi->s_md_lock); list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) { if (entry->efd_tid != commit_tid) break; cut_pos = &entry->efd_list; } if (cut_pos) list_cut_position(&freed_data_list, &sbi->s_freed_data_list, cut_pos); spin_unlock(&sbi->s_md_lock); if (test_opt(sb, DISCARD)) { list_for_each_entry(entry, &freed_data_list, efd_list) { err = ext4_issue_discard(sb, entry->efd_group, entry->efd_start_cluster, entry->efd_count, &discard_bio); if (err && err != -EOPNOTSUPP) { ext4_msg(sb, KERN_WARNING, "discard request in" " group:%d block:%d count:%d failed" " with %d", entry->efd_group, entry->efd_start_cluster, entry->efd_count, err); } else if (err == -EOPNOTSUPP) break; } if (discard_bio) { submit_bio_wait(discard_bio); bio_put(discard_bio); } } list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) ext4_free_data_in_buddy(sb, entry); } int __init ext4_init_mballoc(void) { ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, SLAB_RECLAIM_ACCOUNT); if (ext4_pspace_cachep == NULL) return -ENOMEM; ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, SLAB_RECLAIM_ACCOUNT); if (ext4_ac_cachep == NULL) { kmem_cache_destroy(ext4_pspace_cachep); return -ENOMEM; } ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, SLAB_RECLAIM_ACCOUNT); if (ext4_free_data_cachep == NULL) { kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); return -ENOMEM; } return 0; } void ext4_exit_mballoc(void) { /* * Wait for completion of call_rcu()'s on ext4_pspace_cachep * before destroying the slab cache. */ rcu_barrier(); kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); kmem_cache_destroy(ext4_free_data_cachep); ext4_groupinfo_destroy_slabs(); } /* * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps * Returns 0 if success or error code */ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle, unsigned int reserv_clstrs) { struct buffer_head *bitmap_bh = NULL; struct ext4_group_desc *gdp; struct buffer_head *gdp_bh; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block; int err, len; BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(ac->ac_b_ex.fe_len <= 0); sb = ac->ac_sb; sbi = EXT4_SB(sb); bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); bitmap_bh = NULL; goto out_err; } BUFFER_TRACE(bitmap_bh, "getting write access"); err = ext4_journal_get_write_access(handle, bitmap_bh); if (err) goto out_err; err = -EIO; gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); if (!gdp) goto out_err; ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, ext4_free_group_clusters(sb, gdp)); BUFFER_TRACE(gdp_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdp_bh); if (err) goto out_err; block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and return EFSCORRUPTED * We leak some of the blocks here. 
ext4_lock_group(sb, ac->ac_b_ex.fe_group); ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) err = -EFSCORRUPTED; goto out_err; } ext4_lock_group(sb, ac->ac_b_ex.fe_group); #ifdef AGGRESSIVE_CHECK { int i; for (i = 0; i < ac->ac_b_ex.fe_len; i++) { BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, bitmap_bh->b_data)); } } #endif ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); if (ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, ac->ac_b_ex.fe_group, gdp)); } len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; ext4_free_group_clusters_set(sb, gdp, len); ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. It should not go negative. */ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) /* release all the reserved blocks if non-delalloc */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, ac->ac_b_ex.fe_group); atomic64_sub(ac->ac_b_ex.fe_len, &sbi_array_rcu_deref(sbi, s_flex_groups, flex_group)->free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (err) goto out_err; err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); out_err: brelse(bitmap_bh); return err; } /* * Here we normalize the request for a locality group. * Group requests are normalized to s_mb_group_prealloc, which is based * on s_stripe when a stripe size is set via the mount option. * s_mb_group_prealloc can be configured via * /sys/fs/ext4/<partition>/mb_group_prealloc * * XXX: should we try to preallocate more than the group has now?
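 *
 * Example (defaults, 4 KiB blocks, no bigalloc): s_mb_group_prealloc
 * is 512 clusters, so a small locality-group request gets its goal
 * length (ac_g_ex.fe_len) bumped to 512 clusters (2 MiB) below, and
 * the unused tail typically ends up in the locality group's
 * preallocation for later small requests.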
*/ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg = ac->ac_lg; BUG_ON(lg == NULL); ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; mb_debug(1, "#%u: goal %u blocks for locality group\n", current->pid, ac->ac_g_ex.fe_len); } /* * Normalization means making request better in terms of * size and alignment */ static noinline_for_stack void ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits, max; ext4_lblk_t end; loff_t size, start_off; loff_t orig_size __maybe_unused; ext4_lblk_t start; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *pa; /* do normalize only data requests, metadata requests do not need preallocation */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; /* sometime caller may want exact blocks */ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; /* caller may indicate that preallocation isn't * required (it's a tail, for example) */ if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) return; if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { ext4_mb_normalize_group_request(ac); return ; } bsbits = ac->ac_sb->s_blocksize_bits; /* first, let's learn actual file size * given current request is allocated */ size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); orig_size = size; /* max size of free chunks */ max = 2 << bsbits; #define NRL_CHECK_SIZE(req, size, max, chunk_size) \ (req <= (size) || max <= (chunk_size)) /* first, try to predict filesize */ /* XXX: should this table be tunable? */ start_off = 0; if (size <= 16 * 1024) { size = 16 * 1024; } else if (size <= 32 * 1024) { size = 32 * 1024; } else if (size <= 64 * 1024) { size = 64 * 1024; } else if (size <= 128 * 1024) { size = 128 * 1024; } else if (size <= 256 * 1024) { size = 256 * 1024; } else if (size <= 512 * 1024) { size = 512 * 1024; } else if (size <= 1024 * 1024) { size = 1024 * 1024; } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (21 - bsbits)) << 21; size = 2 * 1024 * 1024; } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (22 - bsbits)) << 22; size = 4 * 1024 * 1024; } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, (8<<20)>>bsbits, max, 8 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (23 - bsbits)) << 23; size = 8 * 1024 * 1024; } else { start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb), ac->ac_o_ex.fe_len) << bsbits; } size = size >> bsbits; start = start_off >> bsbits; /* * For tiny groups (smaller than 8MB) the chosen allocation * alignment may be larger than group size. Make sure the * alignment does not move allocation to a different group which * makes mballoc fail assertions later. */ start = max(start, rounddown(ac->ac_o_ex.fe_logical, (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb))); /* don't cover already allocated blocks in selected range */ if (ar->pleft && start <= ar->lleft) { size -= ar->lleft + 1 - start; start = ar->lleft + 1; } if (ar->pright && start + size - 1 >= ar->lright) size -= start + size - ar->lright; /* * Trim allocation request for filesystems with artificially small * groups. 
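 *
 * E.g. (made-up geometry): if the predicted window is 2048 blocks but
 * the filesystem was created with only 1024 blocks per group, the
 * goal is clamped to 1024 blocks so it cannot span a group boundary.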
*/ if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb); end = start + size; /* check we don't cross already preallocated blocks */ rcu_read_lock(); list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { ext4_lblk_t pa_end; if (pa->pa_deleted) continue; spin_lock(&pa->pa_lock); if (pa->pa_deleted) { spin_unlock(&pa->pa_lock); continue; } pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), pa->pa_len); /* PA must not overlap original request */ BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || ac->ac_o_ex.fe_logical < pa->pa_lstart)); /* skip PAs this normalized request doesn't overlap with */ if (pa->pa_lstart >= end || pa_end <= start) { spin_unlock(&pa->pa_lock); continue; } BUG_ON(pa->pa_lstart <= start && pa_end >= end); /* adjust start or end to be adjacent to this pa */ if (pa_end <= ac->ac_o_ex.fe_logical) { BUG_ON(pa_end < start); start = pa_end; } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { BUG_ON(pa->pa_lstart > end); end = pa->pa_lstart; } spin_unlock(&pa->pa_lock); } rcu_read_unlock(); size = end - start; /* XXX: extra loop to check we really don't overlap preallocations */ rcu_read_lock(); list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { ext4_lblk_t pa_end; spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0) { pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb), pa->pa_len); BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); } spin_unlock(&pa->pa_lock); } rcu_read_unlock(); if (start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical) { ext4_msg(ac->ac_sb, KERN_ERR, "start %lu, size %lu, fe_logical %lu", (unsigned long) start, (unsigned long) size, (unsigned long) ac->ac_o_ex.fe_logical); BUG(); } BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ /* XXX: is it better to align blocks WRT to logical * placement or satisfy big request as is */ ac->ac_g_ex.fe_logical = start; ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); /* define goal start in order to merge */ if (ar->pright && (ar->lright == (start + size))) { /* merge to the right */ ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, &ac->ac_f_ex.fe_group, &ac->ac_f_ex.fe_start); ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } if (ar->pleft && (ar->lleft + 1 == start)) { /* merge to the left */ ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, &ac->ac_f_ex.fe_group, &ac->ac_f_ex.fe_start); ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, (unsigned) orig_size, (unsigned) start); } static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { atomic_inc(&sbi->s_bal_reqs); atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) atomic_inc(&sbi->s_bal_goals); if (ac->ac_found > sbi->s_mb_max_to_scan) atomic_inc(&sbi->s_bal_breaks); } if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) trace_ext4_mballoc_alloc(ac); else trace_ext4_mballoc_prealloc(ac); } /* * Called on failure; free up any blocks from the inode PA for this * context. We don't need this for MB_GROUP_PA because we only change * pa_free in ext4_mb_release_context(), but on failure, we've already * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. 
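 *
 * In short: with no PA attached, the freshly found extent (ac_f_ex) is
 * handed back to the buddy; with an inode PA, pa_free is re-credited;
 * a group PA needs no correction here for the reason given above.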
*/ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; struct ext4_buddy e4b; int err; if (pa == NULL) { if (ac->ac_f_ex.fe_len == 0) return; err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); if (err) { /* * This should never happen since we pin the * pages in the ext4_allocation_context so * ext4_mb_load_buddy() should never fail. */ WARN(1, "mb_load_buddy failed (%d)", err); return; } ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group); mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, ac->ac_f_ex.fe_len); ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); ext4_mb_unload_buddy(&e4b); return; } if (pa->pa_type == MB_INODE_PA) pa->pa_free += ac->ac_b_ex.fe_len; } /* * use blocks preallocated to inode */ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; ext4_fsblk_t end; int len; /* found preallocated blocks, use them */ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); len = EXT4_NUM_B2C(sbi, end - start); ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; ac->ac_status = AC_STATUS_FOUND; ac->ac_pa = pa; BUG_ON(start < pa->pa_pstart); BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); BUG_ON(pa->pa_free < len); pa->pa_free -= len; mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); } /* * use blocks preallocated to locality group */ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { unsigned int len = ac->ac_o_ex.fe_len; ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; ac->ac_status = AC_STATUS_FOUND; ac->ac_pa = pa; /* we don't correct pa_pstart or pa_plen here to avoid * possible race when the group is being loaded concurrently * instead we correct pa later, after blocks are marked * in on-disk bitmap -- see ext4_mb_release_context() * Other CPUs are prevented from allocating from this pa by lg_mutex */ mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); } /* * Return the prealloc space that have minimal distance * from the goal block. @cpa is the prealloc * space that is having currently known minimal distance * from the goal block. 
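 *
 * Illustrative example: with a goal block of 1000, a current best @cpa at
 * physical block 1300 (distance 300) loses to a @pa at block 900
 * (distance 100); the reference is moved from @cpa to @pa accordingly.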
*/ static struct ext4_prealloc_space * ext4_mb_check_group_pa(ext4_fsblk_t goal_block, struct ext4_prealloc_space *pa, struct ext4_prealloc_space *cpa) { ext4_fsblk_t cur_distance, new_distance; if (cpa == NULL) { atomic_inc(&pa->pa_count); return pa; } cur_distance = abs(goal_block - cpa->pa_pstart); new_distance = abs(goal_block - pa->pa_pstart); if (cur_distance <= new_distance) return cpa; /* drop the previous reference */ atomic_dec(&cpa->pa_count); atomic_inc(&pa->pa_count); return pa; } /* * search goal blocks in preallocated space */ static noinline_for_stack int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; struct ext4_prealloc_space *pa, *cpa = NULL; ext4_fsblk_t goal_block; /* only data can be preallocated */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return 0; /* first, try per-file preallocation */ rcu_read_lock(); list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { /* all fields in this condition don't change, * so we can skip locking for them */ if (ac->ac_o_ex.fe_logical < pa->pa_lstart || ac->ac_o_ex.fe_logical >= (pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len))) continue; /* non-extent files can't have physical blocks past 2^32 */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) > EXT4_MAX_BLOCK_FILE_PHYS)) continue; /* found preallocated blocks, use them */ spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0 && pa->pa_free) { atomic_inc(&pa->pa_count); ext4_mb_use_inode_pa(ac, pa); spin_unlock(&pa->pa_lock); ac->ac_criteria = 10; rcu_read_unlock(); return 1; } spin_unlock(&pa->pa_lock); } rcu_read_unlock(); /* can we use group allocation? */ if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) return 0; /* inode may have no locality group for some reason */ lg = ac->ac_lg; if (lg == NULL) return 0; order = fls(ac->ac_o_ex.fe_len) - 1; if (order > PREALLOC_TB_SIZE - 1) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); /* * search for the prealloc space that is having * minimal distance from the goal block. */ for (i = order; i < PREALLOC_TB_SIZE; i++) { rcu_read_lock(); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], pa_inode_list) { spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) { cpa = ext4_mb_check_group_pa(goal_block, pa, cpa); } spin_unlock(&pa->pa_lock); } rcu_read_unlock(); } if (cpa) { ext4_mb_use_group_pa(ac, cpa); ac->ac_criteria = 20; return 1; } return 0; } /* * the function goes through all block freed in the group * but not yet committed and marks them used in in-core bitmap. * buddy must be generated from this bitmap * Need to be called with the ext4 group lock held */ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group) { struct rb_node *n; struct ext4_group_info *grp; struct ext4_free_data *entry; grp = ext4_get_group_info(sb, group); n = rb_first(&(grp->bb_free_root)); while (n) { entry = rb_entry(n, struct ext4_free_data, efd_node); ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); n = rb_next(n); } return; } /* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. 
buddy must be generated from this bitmap * Need to be called with ext4 group lock held */ static noinline_for_stack void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_prealloc_space *pa; struct list_head *cur; ext4_group_t groupnr; ext4_grpblk_t start; int preallocated = 0; int len; /* all form of preallocation discards first load group, * so the only competing code is preallocation use. * we don't need any locking here * notice we do NOT ignore preallocations with pa_deleted * otherwise we could leave used blocks available for * allocation in buddy when concurrent ext4_mb_put_pa() * is dropping preallocation */ list_for_each(cur, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); spin_lock(&pa->pa_lock); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &start); len = pa->pa_len; spin_unlock(&pa->pa_lock); if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); ext4_set_bits(bitmap, start, len); preallocated += len; } mb_debug(1, "preallocated %u for group %u\n", preallocated, group); } static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); BUG_ON(atomic_read(&pa->pa_count)); BUG_ON(pa->pa_deleted == 0); kmem_cache_free(ext4_pspace_cachep, pa); } /* * drops a reference to preallocated space descriptor * if this was the last reference and the space is consumed */ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, struct super_block *sb, struct ext4_prealloc_space *pa) { ext4_group_t grp; ext4_fsblk_t grp_blk; /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { spin_unlock(&pa->pa_lock); return; } if (pa->pa_deleted == 1) { spin_unlock(&pa->pa_lock); return; } pa->pa_deleted = 1; spin_unlock(&pa->pa_lock); grp_blk = pa->pa_pstart; /* * If doing group-based preallocation, pa_pstart may be in the * next group when pa is used up */ if (pa->pa_type == MB_GROUP_PA) grp_blk--; grp = ext4_get_group_number(sb, grp_blk); /* * possible race: * * P1 (buddy init) P2 (regular allocation) * find block B in PA * copy on-disk bitmap to buddy * mark B in on-disk bitmap * drop PA from group * mark all PAs in buddy * * thus, P1 initializes buddy with B available. to prevent this * we make "copy" and "mark all PAs" atomic and serialize "drop PA" * against that pair */ ext4_lock_group(sb, grp); list_del(&pa->pa_group_list); ext4_unlock_group(sb, grp); spin_lock(pa->pa_obj_lock); list_del_rcu(&pa->pa_inode_list); spin_unlock(pa->pa_obj_lock); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } /* * creates new preallocated space for given inode */ static noinline_for_stack int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_prealloc_space *pa; struct ext4_group_info *grp; struct ext4_inode_info *ei; /* preallocate only when found space is larger then requested */ BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); if (pa == NULL) return -ENOMEM; if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { int winl; int wins; int win; int offs; /* we can't allocate as much as normalizer wants. 
* so, found space must get proper lstart * to cover original request */ BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); /* we're limited by original request in that * logical block must be covered any way * winl is window we can move our chunk within */ winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; /* also, we should cover whole original request */ wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); /* the smallest one defines real window */ win = min(winl, wins); offs = ac->ac_o_ex.fe_logical % EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (offs && offs < win) win = offs; ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - EXT4_NUM_B2C(sbi, win); BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); } /* preallocation can change ac_b_ex, thus we store actually * allocated blocks for history */ ac->ac_f_ex = ac->ac_b_ex; pa->pa_lstart = ac->ac_b_ex.fe_logical; pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_inode_pa(ac, pa); ext4_mb_use_inode_pa(ac, pa); atomic_add(pa->pa_free, &sbi->s_mb_preallocated); ei = EXT4_I(ac->ac_inode); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); pa->pa_obj_lock = &ei->i_prealloc_lock; pa->pa_inode = ac->ac_inode; ext4_lock_group(sb, ac->ac_b_ex.fe_group); list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); spin_lock(pa->pa_obj_lock); list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); spin_unlock(pa->pa_obj_lock); return 0; } /* * creates new preallocated space for locality group inodes belongs to */ static noinline_for_stack int ext4_mb_new_group_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg; struct ext4_prealloc_space *pa; struct ext4_group_info *grp; /* preallocate only when found space is larger then requested */ BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); BUG_ON(ext4_pspace_cachep == NULL); pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); if (pa == NULL) return -ENOMEM; /* preallocation can change ac_b_ex, thus we store actually * allocated blocks for history */ ac->ac_f_ex = ac->ac_b_ex; pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_lstart = pa->pa_pstart; pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_GROUP_PA; mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_group_pa(ac, pa); ext4_mb_use_group_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); lg = ac->ac_lg; BUG_ON(lg == NULL); pa->pa_obj_lock = &lg->lg_prealloc_lock; pa->pa_inode = NULL; ext4_lock_group(sb, ac->ac_b_ex.fe_group); list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); /* * We will later add the new pa to the right bucket * after 
updating the pa_free in ext4_mb_release_context */ return 0; } static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) { int err; if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) err = ext4_mb_new_group_pa(ac); else err = ext4_mb_new_inode_pa(ac); return err; } /* * finds all unused blocks in on-disk bitmap, frees them in * in-core bitmap and buddy. * @pa must be unlinked from inode and group lists, so that * nobody else can find/use it. * the caller MUST hold group/inode locks. * TODO: optimize the case when there are no in-core structures yet */ static noinline_for_stack int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned int end; unsigned int next; ext4_group_t group; ext4_grpblk_t bit; unsigned long long grp_blk_start; int free = 0; BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; while (bit < end) { bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); mb_debug(1, " free preallocated %u/%u in group %u\n", (unsigned) ext4_group_first_block_no(sb, group) + bit, (unsigned) next - bit, (unsigned) group); free += next - bit; trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + EXT4_C2B(sbi, bit)), next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } if (free != pa->pa_free) { ext4_msg(e4b->bd_sb, KERN_CRIT, "pa %p: logic %lu, phys. %lu, len %lu", pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, (unsigned long) pa->pa_len); ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", free, pa->pa_free); /* * pa is already deleted so we use the value obtained * from the bitmap and continue. 
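 * (The mismatch is reported via ext4_grp_locked_error() above; the
 * discard statistics below use the bitmap-derived count rather than
 * the stale pa_free.)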
*/ } atomic_add(free, &sbi->s_mb_discarded); return 0; } static noinline_for_stack int ext4_mb_release_group_pa(struct ext4_buddy *e4b, struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; ext4_group_t group; ext4_grpblk_t bit; trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); return 0; } /* * releases all preallocations in given group * * first, we need to decide discard policy: * - when do we discard * 1) ENOSPC * - how many do we discard * 1) how many requested */ static noinline_for_stack int ext4_mb_discard_group_preallocations(struct super_block *sb, ext4_group_t group, int needed) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; struct list_head list; struct ext4_buddy e4b; int err; int busy = 0; int free = 0; mb_debug(1, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) return 0; bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); ext4_error(sb, "Error %d reading block bitmap for %u", err, group); return 0; } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { ext4_warning(sb, "Error %d loading buddy information for %u", err, group); put_bh(bitmap_bh); return 0; } if (needed == 0) needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; INIT_LIST_HEAD(&list); repeat: ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { spin_unlock(&pa->pa_lock); busy = 1; continue; } if (pa->pa_deleted) { spin_unlock(&pa->pa_lock); continue; } /* seems this one can be freed ... */ pa->pa_deleted = 1; /* we can trust pa_free ... */ free += pa->pa_free; spin_unlock(&pa->pa_lock); list_del(&pa->pa_group_list); list_add(&pa->u.pa_tmp_list, &list); } /* if we still need more blocks and some PAs were used, try again */ if (free < needed && busy) { busy = 0; ext4_unlock_group(sb, group); cond_resched(); goto repeat; } /* found anything to free? */ if (list_empty(&list)) { BUG_ON(free != 0); goto out; } /* now free all selected PAs */ list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { /* remove from object (inode or locality group) */ spin_lock(pa->pa_obj_lock); list_del_rcu(&pa->pa_inode_list); spin_unlock(pa->pa_obj_lock); if (pa->pa_type == MB_GROUP_PA) ext4_mb_release_group_pa(&e4b, pa); else ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } out: ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); return free; } /* * releases all non-used preallocated blocks for given inode * * It's important to discard preallocations under i_data_sem * We don't want another block to be served from the prealloc * space when we are discarding the inode prealloc space. * * FIXME!! 
Make sure it is valid at all the call sites */ void ext4_discard_preallocations(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; ext4_group_t group = 0; struct list_head list; struct ext4_buddy e4b; int err; if (!S_ISREG(inode->i_mode)) { /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ return; } mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); trace_ext4_discard_preallocations(inode); INIT_LIST_HEAD(&list); repeat: /* first, collect all pa's in the inode */ spin_lock(&ei->i_prealloc_lock); while (!list_empty(&ei->i_prealloc_list)) { pa = list_entry(ei->i_prealloc_list.next, struct ext4_prealloc_space, pa_inode_list); BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* this shouldn't happen often - nobody should * use preallocation while we're discarding it */ spin_unlock(&pa->pa_lock); spin_unlock(&ei->i_prealloc_lock); ext4_msg(sb, KERN_ERR, "uh-oh! used pa while discarding"); WARN_ON(1); schedule_timeout_uninterruptible(HZ); goto repeat; } if (pa->pa_deleted == 0) { pa->pa_deleted = 1; spin_unlock(&pa->pa_lock); list_del_rcu(&pa->pa_inode_list); list_add(&pa->u.pa_tmp_list, &list); continue; } /* someone is deleting pa right now */ spin_unlock(&pa->pa_lock); spin_unlock(&ei->i_prealloc_lock); /* we have to wait here because pa_deleted * doesn't mean pa is already unlinked from * the list. as we might be called from * ->clear_inode() the inode will get freed * and concurrent thread which is unlinking * pa from inode's list may access already * freed memory, bad-bad-bad */ /* XXX: if this happens too often, we can * add a flag to force wait only in case * of ->clear_inode(), but not in case of * regular truncate */ schedule_timeout_uninterruptible(HZ); goto repeat; } spin_unlock(&ei->i_prealloc_lock); list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { BUG_ON(pa->pa_type != MB_INODE_PA); group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { ext4_error(sb, "Error %d loading buddy information for %u", err, group); continue; } bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); ext4_error(sb, "Error %d reading block bitmap for %u", err, group); ext4_mb_unload_buddy(&e4b); continue; } ext4_lock_group(sb, group); list_del(&pa->pa_group_list); ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } } #ifdef CONFIG_EXT4_DEBUG static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; ext4_group_t ngroups, i; if (!ext4_mballoc_debug || (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) return; ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:" " Allocation context details:"); ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d", ac->ac_status, ac->ac_flags); ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " "goal %lu/%lu/%lu@%lu, " "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, (unsigned long)ac->ac_o_ex.fe_start, (unsigned long)ac->ac_o_ex.fe_len, (unsigned long)ac->ac_o_ex.fe_logical, (unsigned long)ac->ac_g_ex.fe_group, (unsigned long)ac->ac_g_ex.fe_start, (unsigned long)ac->ac_g_ex.fe_len, (unsigned long)ac->ac_g_ex.fe_logical, (unsigned 
long)ac->ac_b_ex.fe_group, (unsigned long)ac->ac_b_ex.fe_start, (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found); ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); struct ext4_prealloc_space *pa; ext4_grpblk_t start; struct list_head *cur; ext4_lock_group(sb, i); list_for_each(cur, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); spin_lock(&pa->pa_lock); ext4_get_group_no_and_offset(sb, pa->pa_pstart, NULL, &start); spin_unlock(&pa->pa_lock); printk(KERN_ERR "PA:%u:%d:%u \n", i, start, pa->pa_len); } ext4_unlock_group(sb, i); if (grp->bb_free == 0) continue; printk(KERN_ERR "%u: %d/%d \n", i, grp->bb_free, grp->bb_fragments); } printk(KERN_ERR "\n"); } #else static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) { return; } #endif /* * We use locality group preallocation for small size file. The size of the * file is determined by the current size or the resulting size after * allocation which ever is larger * * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req */ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits = ac->ac_sb->s_blocksize_bits; loff_t size, isize; if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; if ((size == isize) && !ext4_fs_is_busy(sbi) && (atomic_read(&ac->ac_inode->i_writecount) == 0)) { ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; } if (sbi->s_mb_group_prealloc <= 0) { ac->ac_flags |= EXT4_MB_STREAM_ALLOC; return; } /* don't use group allocation for large files */ size = max(size, isize); if (size > sbi->s_mb_stream_request) { ac->ac_flags |= EXT4_MB_STREAM_ALLOC; return; } BUG_ON(ac->ac_lg != NULL); /* * locality group prealloc space are per cpu. The reason for having * per cpu locality group is to reduce the contention between block * request from multiple CPUs. 
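 * The per-cpu group is picked with raw_cpu_ptr() below, and allocations
 * from the chosen group are then serialized by its lg_mutex.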
*/ ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups); /* we're going to use group allocation */ ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; /* serialize all allocations in the group */ mutex_lock(&ac->ac_lg->lg_mutex); } static noinline_for_stack int ext4_mb_initialize_context(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { struct super_block *sb = ar->inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_group_t group; unsigned int len; ext4_fsblk_t goal; ext4_grpblk_t block; /* we can't allocate > group size */ len = ar->len; /* just a dirty hack to filter too big requests */ if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) len = EXT4_CLUSTERS_PER_GROUP(sb); /* start searching from the goal */ goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || goal >= ext4_blocks_count(es)) goal = le32_to_cpu(es->s_first_data_block); ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; ac->ac_inode = ar->inode; ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; ac->ac_o_ex.fe_group = group; ac->ac_o_ex.fe_start = block; ac->ac_o_ex.fe_len = len; ac->ac_g_ex = ac->ac_o_ex; ac->ac_flags = ar->flags; /* we have to define context: we'll we work with a file or * locality group. this is a policy, actually */ ext4_mb_group_or_file(ac); mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " "left: %u/%u, right %u/%u to %swritable\n", (unsigned) ar->len, (unsigned) ar->logical, (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, (unsigned) ar->lleft, (unsigned) ar->pleft, (unsigned) ar->lright, (unsigned) ar->pright, atomic_read(&ar->inode->i_writecount) ? "" : "non-"); return 0; } static noinline_for_stack void ext4_mb_discard_lg_preallocations(struct super_block *sb, struct ext4_locality_group *lg, int order, int total_entries) { ext4_group_t group = 0; struct ext4_buddy e4b; struct list_head discard_list; struct ext4_prealloc_space *pa, *tmp; mb_debug(1, "discard locality group preallocation\n"); INIT_LIST_HEAD(&discard_list); spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], pa_inode_list) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* * This is the pa that we just used * for block allocation. So don't * free that */ spin_unlock(&pa->pa_lock); continue; } if (pa->pa_deleted) { spin_unlock(&pa->pa_lock); continue; } /* only lg prealloc space */ BUG_ON(pa->pa_type != MB_GROUP_PA); /* seems this one can be freed ... */ pa->pa_deleted = 1; spin_unlock(&pa->pa_lock); list_del_rcu(&pa->pa_inode_list); list_add(&pa->u.pa_tmp_list, &discard_list); total_entries--; if (total_entries <= 5) { /* * we want to keep only 5 entries * allowing it to grow to 8. This * mak sure we don't call discard * soon for this list. */ break; } } spin_unlock(&lg->lg_prealloc_lock); list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { int err; group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { ext4_error(sb, "Error %d loading buddy information for %u", err, group); continue; } ext4_lock_group(sb, group); list_del(&pa->pa_group_list); ext4_mb_release_group_pa(&e4b, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } } /* * We have incremented pa_count. 
So it cannot be freed at this * point. Also we hold lg_mutex. So no parallel allocation is * possible from this lg. That means pa_free cannot be updated. * * A parallel ext4_mb_discard_group_preallocations is possible. * which can cause the lg_prealloc_list to be updated. */ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) { int order, added = 0, lg_prealloc_count = 1; struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg = ac->ac_lg; struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; order = fls(pa->pa_free) - 1; if (order > PREALLOC_TB_SIZE - 1) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; /* Add the prealloc space to lg */ spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], pa_inode_list) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted) { spin_unlock(&tmp_pa->pa_lock); continue; } if (!added && pa->pa_free < tmp_pa->pa_free) { /* Add to the tail of the previous entry */ list_add_tail_rcu(&pa->pa_inode_list, &tmp_pa->pa_inode_list); added = 1; /* * we want to count the total * number of entries in the list */ } spin_unlock(&tmp_pa->pa_lock); lg_prealloc_count++; } if (!added) list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list[order]); spin_unlock(&lg->lg_prealloc_lock); /* Now trim the list to be not more than 8 elements */ if (lg_prealloc_count > 8) { ext4_mb_discard_lg_preallocations(sb, lg, order, lg_prealloc_count); return; } return ; } /* * release all resource we used in allocation */ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { if (pa->pa_type == MB_GROUP_PA) { /* see comment in ext4_mb_use_group_pa() */ spin_lock(&pa->pa_lock); pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); } } if (pa) { /* * We want to add the pa to the right bucket. * Remove it from the list and while adding * make sure the list to which we are adding * doesn't grow big. 
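 * ("doesn't grow big" here means ext4_mb_add_n_trim() keeps each
 * per-order lg_prealloc_list at no more than about 8 entries and
 * discards the excess.)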
*/ if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { spin_lock(pa->pa_obj_lock); list_del_rcu(&pa->pa_inode_list); spin_unlock(pa->pa_obj_lock); ext4_mb_add_n_trim(ac); } ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->ac_bitmap_page) put_page(ac->ac_bitmap_page); if (ac->ac_buddy_page) put_page(ac->ac_buddy_page); if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); return 0; } static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) { ext4_group_t i, ngroups = ext4_get_groups_count(sb); int ret; int freed = 0; trace_ext4_mb_discard_preallocations(sb, needed); for (i = 0; i < ngroups && needed > 0; i++) { ret = ext4_mb_discard_group_preallocations(sb, i, needed); freed += ret; needed -= ret; } return freed; } /* * Main entry point into mballoc to allocate blocks * it tries to use preallocation first, then falls back * to usual allocation */ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct ext4_allocation_request *ar, int *errp) { int freed; struct ext4_allocation_context *ac = NULL; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block = 0; unsigned int inquota = 0; unsigned int reserv_clstrs = 0; might_sleep(); sb = ar->inode->i_sb; sbi = EXT4_SB(sb); trace_ext4_request_blocks(ar); /* Allow to use superuser reservation for quota file */ if (ext4_is_quota_file(ar->inode)) ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { /* Without delayed allocation we need to verify * there is enough free blocks to do block allocation * and verify allocation doesn't exceed the quota limits. */ while (ar->len && ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ cond_resched(); ar->len = ar->len >> 1; } if (!ar->len) { *errp = -ENOSPC; return 0; } reserv_clstrs = ar->len; if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { dquot_alloc_block_nofail(ar->inode, EXT4_C2B(sbi, ar->len)); } else { while (ar->len && dquot_alloc_block(ar->inode, EXT4_C2B(sbi, ar->len))) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; } } inquota = ar->len; if (ar->len == 0) { *errp = -EDQUOT; goto out; } } ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { ar->len = 0; *errp = -ENOMEM; goto out; } *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; goto out; } ac->ac_op = EXT4_MB_HISTORY_PREALLOC; if (!ext4_mb_use_preallocated(ac)) { ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); repeat: /* allocate space in core */ *errp = ext4_mb_regular_allocator(ac); if (*errp) goto discard_and_exit; /* as we've just preallocated more space than * user requested originally, we store allocated * space in a special descriptor */ if (ac->ac_status == AC_STATUS_FOUND && ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) *errp = ext4_mb_new_preallocation(ac); if (*errp) { discard_and_exit: ext4_discard_allocated_blocks(ac); goto errout; } } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); if (*errp) { ext4_discard_allocated_blocks(ac); goto errout; } else { block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); ar->len = ac->ac_b_ex.fe_len; } } else { freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); if (freed) goto repeat; *errp = -ENOSPC; } errout: if (*errp) { ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); } ext4_mb_release_context(ac); out: if (ac) kmem_cache_free(ext4_ac_cachep, ac); if (inquota && ar->len < inquota) dquot_free_block(ar->inode, 
EXT4_C2B(sbi, inquota - ar->len)); if (!ar->len) { if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); } trace_ext4_allocate_blocks(ar, (unsigned long long)block); return block; } /* * We can merge two free data extents only if the physical blocks * are contiguous, AND the extents were freed by the same transaction, * AND the blocks are associated with the same group. */ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi, struct ext4_free_data *entry, struct ext4_free_data *new_entry, struct rb_root *entry_rb_root) { if ((entry->efd_tid != new_entry->efd_tid) || (entry->efd_group != new_entry->efd_group)) return; if (entry->efd_start_cluster + entry->efd_count == new_entry->efd_start_cluster) { new_entry->efd_start_cluster = entry->efd_start_cluster; new_entry->efd_count += entry->efd_count; } else if (new_entry->efd_start_cluster + new_entry->efd_count == entry->efd_start_cluster) { new_entry->efd_count += entry->efd_count; } else return; spin_lock(&sbi->s_md_lock); list_del(&entry->efd_list); spin_unlock(&sbi->s_md_lock); rb_erase(&entry->efd_node, entry_rb_root); kmem_cache_free(ext4_free_data_cachep, entry); } static noinline_for_stack int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_free_data *new_entry) { ext4_group_t group = e4b->bd_group; ext4_grpblk_t cluster; ext4_grpblk_t clusters = new_entry->efd_count; struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct rb_node **n = &db->bb_free_root.rb_node, *node; struct rb_node *parent = NULL, *new_node; BUG_ON(!ext4_handle_valid(handle)); BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); new_node = &new_entry->efd_node; cluster = new_entry->efd_start_cluster; if (!*n) { /* first free block exent. 
We need to protect buddy cache from being freed, * otherwise we'll refresh it from * on-disk bitmap and lose not-yet-available * blocks */ get_page(e4b->bd_buddy_page); get_page(e4b->bd_bitmap_page); } while (*n) { parent = *n; entry = rb_entry(parent, struct ext4_free_data, efd_node); if (cluster < entry->efd_start_cluster) n = &(*n)->rb_left; else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) n = &(*n)->rb_right; else { ext4_grp_locked_error(sb, group, 0, ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, cluster), "Block already on to-be-freed list"); kmem_cache_free(ext4_free_data_cachep, new_entry); return 0; } } rb_link_node(new_node, parent, n); rb_insert_color(new_node, &db->bb_free_root); /* Now try to see the extent can be merged to left and right */ node = rb_prev(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); ext4_try_merge_freed_extent(sbi, entry, new_entry, &(db->bb_free_root)); } node = rb_next(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); ext4_try_merge_freed_extent(sbi, entry, new_entry, &(db->bb_free_root)); } spin_lock(&sbi->s_md_lock); list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list); sbi->s_mb_free_pending += clusters; spin_unlock(&sbi->s_md_lock); return 0; } /** * ext4_free_blocks() -- Free given blocks and update quota * @handle: handle for this transaction * @inode: inode * @block: start physical block to free * @count: number of blocks to count * @flags: flags used by ext4_free_blocks */ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags) { struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; struct ext4_group_desc *gdp; unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; ext4_group_t block_group; struct ext4_sb_info *sbi; struct ext4_buddy e4b; unsigned int count_clusters; int err = 0; int ret; might_sleep(); if (bh) { if (block) BUG_ON(block != bh->b_blocknr); else block = bh->b_blocknr; } sbi = EXT4_SB(sb); if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " "block = %llu, count = %lu", block, count); goto error_return; } ext4_debug("freeing block %llu\n", block); trace_ext4_free_blocks(inode, block, count, flags); if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { BUG_ON(count > 1); ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, inode, bh, block); } /* * If the extent to be freed does not begin on a cluster * boundary, we need to deal with partial clusters at the * beginning and end of the extent. Normally we will free * blocks at the beginning or the end unless we are explicitly * requested to avoid doing so. 
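 *
 * Worked example (illustrative, bigalloc with a cluster ratio of 16):
 * freeing 40 blocks starting at block 103 gives a cluster offset of 7,
 * so by default the range is widened to 47 blocks starting at block 96;
 * with EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER it is instead shrunk to
 * 31 blocks starting at block 112.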
*/ overflow = EXT4_PBLK_COFF(sbi, block); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { overflow = sbi->s_cluster_ratio - overflow; block += overflow; if (count > overflow) count -= overflow; else return; } else { block -= overflow; count += overflow; } } overflow = EXT4_LBLK_COFF(sbi, count); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { if (count > overflow) count -= overflow; else return; } else count += sbi->s_cluster_ratio - overflow; } if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { int i; int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA; for (i = 0; i < count; i++) { cond_resched(); if (is_metadata) bh = sb_find_get_block(inode->i_sb, block + i); ext4_forget(handle, is_metadata, inode, bh, block + i); } } do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT( ext4_get_group_info(sb, block_group)))) return; /* * Check to see if we are freeing blocks across a group * boundary. */ if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { overflow = EXT4_C2B(sbi, bit) + count - EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; } count_clusters = EXT4_NUM_B2C(sbi, count); bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); bitmap_bh = NULL; goto error_return; } gdp = ext4_get_group_desc(sb, block_group, &gd_bh); if (!gdp) { err = -EIO; goto error_return; } if (in_range(ext4_block_bitmap(sb, gdp), block, count) || in_range(ext4_inode_bitmap(sb, gdp), block, count) || in_range(block, ext4_inode_table(sb, gdp), sbi->s_itb_per_group) || in_range(block + count - 1, ext4_inode_table(sb, gdp), sbi->s_itb_per_group)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ goto error_return; } BUFFER_TRACE(bitmap_bh, "getting write access"); err = ext4_journal_get_write_access(handle, bitmap_bh); if (err) goto error_return; /* * We are about to modify some metadata. Call the journal APIs * to unshare ->b_data if a currently-committing transaction is * using it */ BUFFER_TRACE(gd_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gd_bh); if (err) goto error_return; #ifdef AGGRESSIVE_CHECK { int i; for (i = 0; i < count_clusters; i++) BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); } #endif trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) goto error_return; /* * We need to make sure we don't reuse the freed block until after the * transaction is committed. We make an exception if the inode is to be * written in writeback mode since writeback mode has weak data * consistency guarantees. */ if (ext4_handle_valid(handle) && ((flags & EXT4_FREE_BLOCKS_METADATA) || !ext4_should_writeback_data(inode))) { struct ext4_free_data *new_entry; /* * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed * to fail. 
*/ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS|__GFP_NOFAIL); new_entry->efd_start_cluster = bit; new_entry->efd_group = block_group; new_entry->efd_count = count_clusters; new_entry->efd_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); ext4_mb_free_metadata(handle, &e4b, new_entry); } else { /* need to update group_info->bb_free and bitmap * with group lock held. generate_buddy look at * them with group lock_held */ if (test_opt(sb, DISCARD)) { err = ext4_issue_discard(sb, block_group, bit, count, NULL); if (err && err != -EOPNOTSUPP) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%d block:%d count:%lu failed" " with %d", block_group, bit, count, err); } else EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); mb_free_blocks(inode, &e4b, bit, count_clusters); } ret = ext4_free_group_clusters(sb, gdp) + count_clusters; ext4_free_group_clusters_set(sb, gdp, ret); ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); atomic64_add(count_clusters, &sbi_array_rcu_deref(sbi, s_flex_groups, flex_group)->free_clusters); } if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); ext4_mb_unload_buddy(&e4b); /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); /* And the group descriptor block */ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); if (!err) err = ret; if (overflow && !err) { block += count; count = overflow; put_bh(bitmap_bh); goto do_more; } error_return: brelse(bitmap_bh); ext4_std_error(sb, err); return; } /** * ext4_group_add_blocks() -- Add given blocks to an existing group * @handle: handle to this transaction * @sb: super block * @block: start physical block to add to the block group * @count: number of blocks to free * * This marks the blocks as free in the bitmap and buddy. */ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count) { struct buffer_head *bitmap_bh = NULL; struct buffer_head *gd_bh; ext4_group_t block_group; ext4_grpblk_t bit; unsigned int i; struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; int err = 0, ret, free_clusters_count; ext4_grpblk_t clusters_freed; ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block); ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1); unsigned long cluster_count = last_cluster - first_cluster + 1; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); if (count == 0) return 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); /* * Check to see if we are freeing blocks across a group * boundary. 
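 * (Unlike ext4_free_blocks() above, which splits such a range and loops
 * via do_more, a range crossing a group boundary is rejected here
 * with -EINVAL.)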
*/ if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) { ext4_warning(sb, "too many blocks added to group %u", block_group); err = -EINVAL; goto error_return; } bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); bitmap_bh = NULL; goto error_return; } desc = ext4_get_group_desc(sb, block_group, &gd_bh); if (!desc) { err = -EIO; goto error_return; } if (in_range(ext4_block_bitmap(sb, desc), block, count) || in_range(ext4_inode_bitmap(sb, desc), block, count) || in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || in_range(block + count - 1, ext4_inode_table(sb, desc), sbi->s_itb_per_group)) { ext4_error(sb, "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); err = -EINVAL; goto error_return; } BUFFER_TRACE(bitmap_bh, "getting write access"); err = ext4_journal_get_write_access(handle, bitmap_bh); if (err) goto error_return; /* * We are about to modify some metadata. Call the journal APIs * to unshare ->b_data if a currently-committing transaction is * using it */ BUFFER_TRACE(gd_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gd_bh); if (err) goto error_return; for (i = 0, clusters_freed = 0; i < cluster_count; i++) { BUFFER_TRACE(bitmap_bh, "clear bit"); if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { ext4_error(sb, "bit already cleared for block %llu", (ext4_fsblk_t)(block + i)); BUFFER_TRACE(bitmap_bh, "bit already cleared"); } else { clusters_freed++; } } err = ext4_mb_load_buddy(sb, block_group, &e4b); if (err) goto error_return; /* * need to update group_info->bb_free and bitmap * with group lock held. generate_buddy look at * them with group lock_held */ ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, cluster_count); mb_free_blocks(NULL, &e4b, bit, cluster_count); free_clusters_count = clusters_freed + ext4_free_group_clusters(sb, desc); ext4_free_group_clusters_set(sb, desc, free_clusters_count); ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh); ext4_group_desc_csum_set(sb, block_group, desc); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, clusters_freed); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); atomic64_add(clusters_freed, &sbi_array_rcu_deref(sbi, s_flex_groups, flex_group)->free_clusters); } ext4_mb_unload_buddy(&e4b); /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); /* And the group descriptor block */ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); if (!err) err = ret; error_return: brelse(bitmap_bh); ext4_std_error(sb, err); return err; } /** * ext4_trim_extent -- function to TRIM one single free extent in the group * @sb: super block for the file system * @start: starting block of the free extent in the alloc. group * @count: number of blocks to TRIM * @group: alloc. group we are working with * @e4b: ext4 buddy for the group * * Trim "count" blocks starting at "start" in the "group". To assure that no * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called with under the group lock. 
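 * (Note that the group lock is dropped around the potentially slow
 * ext4_issue_discard() call and re-taken before the blocks are freed
 * again, hence the __releases/__acquires annotations below.)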
*/ static int ext4_trim_extent(struct super_block *sb, int start, int count, ext4_group_t group, struct ext4_buddy *e4b) __releases(bitlock) __acquires(bitlock) { struct ext4_free_extent ex; int ret = 0; trace_ext4_trim_extent(sb, group, start, count); assert_spin_locked(ext4_group_lock_ptr(sb, group)); ex.fe_start = start; ex.fe_group = group; ex.fe_len = count; /* * Mark blocks used, so no one can reuse them while * being trimmed. */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); ret = ext4_issue_discard(sb, group, start, count, NULL); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); return ret; } /** * ext4_trim_all_free -- function to trim all free space in alloc. group * @sb: super block for file system * @group: group to be trimmed * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count * * ext4_trim_all_free walks through group's buddy bitmap searching for free * extents. When the free block is found, ext4_trim_extent is called to TRIM * the extent. * * * ext4_trim_all_free walks through group's block bitmap searching for free * extents. When the free extent is found, mark it as used in group buddy * bitmap. Then issue a TRIM command on this extent and free the extent in * the group buddy bitmap. This is done until whole group is scanned. */ static ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) { void *bitmap; ext4_grpblk_t next, count = 0, free_count = 0; struct ext4_buddy e4b; int ret = 0; trace_ext4_trim_all_free(sb, group, start, max); ret = ext4_mb_load_buddy(sb, group, &e4b); if (ret) { ext4_warning(sb, "Error %d loading buddy information for %u", ret, group); return ret; } bitmap = e4b.bd_bitmap; ext4_lock_group(sb, group); if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) goto out; start = (e4b.bd_info->bb_first_free > start) ? e4b.bd_info->bb_first_free : start; while (start <= max) { start = mb_find_next_zero_bit(bitmap, max + 1, start); if (start > max) break; next = mb_find_next_bit(bitmap, max + 1, start); if ((next - start) >= minblocks) { ret = ext4_trim_extent(sb, start, next - start, group, &e4b); if (ret && ret != -EOPNOTSUPP) break; ret = 0; count += next - start; } free_count += next - start; start = next + 1; if (fatal_signal_pending(current)) { count = -ERESTARTSYS; break; } if (need_resched()) { ext4_unlock_group(sb, group); cond_resched(); ext4_lock_group(sb, group); } if ((e4b.bd_info->bb_free - free_count) < minblocks) break; } if (!ret) { ret = count; EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); } out: ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); ext4_debug("trimmed %d blocks in the group %d\n", count, group); return ret; } /** * ext4_trim_fs() -- trim ioctl handle function * @sb: superblock for filesystem * @range: fstrim_range structure * * start: First Byte to trim * len: number of Bytes to trim from start * minlen: minimum extent length in Bytes * ext4_trim_fs goes through all allocation groups containing Bytes from * start to start+len. For each such a group ext4_trim_all_free function * is invoked to trim all free space. 
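 * Illustrative conversion, assuming 4KiB blocks: start = 1GiB and
 * len = 1GiB become blocks 262144..524287, which are then split into
 * per-group [first_cluster, last_cluster] pieces for trimming.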
*/ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { struct request_queue *q = bdev_get_queue(sb->s_bdev); struct ext4_group_info *grp; ext4_group_t group, first_group, last_group; ext4_grpblk_t cnt = 0, first_cluster, last_cluster; uint64_t start, end, minlen, trimmed = 0; ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); int ret = 0; start = range->start >> sb->s_blocksize_bits; end = start + (range->len >> sb->s_blocksize_bits) - 1; minlen = EXT4_NUM_B2C(EXT4_SB(sb), range->minlen >> sb->s_blocksize_bits); if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) || start >= max_blks || range->len < sb->s_blocksize) return -EINVAL; /* No point to try to trim less than discard granularity */ if (range->minlen < q->limits.discard_granularity) { minlen = EXT4_NUM_B2C(EXT4_SB(sb), q->limits.discard_granularity >> sb->s_blocksize_bits); if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) goto out; } if (end >= max_blks) end = max_blks - 1; if (end <= first_data_blk) goto out; if (start < first_data_blk) start = first_data_blk; /* Determine first and last group to examine based on start and end */ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, &first_group, &first_cluster); ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, &last_group, &last_cluster); /* end now represents the last cluster to discard in this group */ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; for (group = first_group; group <= last_group; group++) { grp = ext4_get_group_info(sb, group); /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) break; } /* * For all the groups except the last one, last cluster will * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to * change it for the last group, note that last_cluster is * already computed earlier by ext4_get_group_no_and_offset() */ if (group == last_group) end = last_cluster; if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, end, minlen); if (cnt < 0) { ret = cnt; break; } trimmed += cnt; } /* * For every group except the first one, we are sure * that the first cluster to discard will be cluster #0. */ first_cluster = 0; } if (!ret) atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); out: range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; return ret; } /* Iterate all the free extents in the group. */ int ext4_mballoc_query_range( struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t end, ext4_mballoc_query_range_fn formatter, void *priv) { void *bitmap; ext4_grpblk_t next; struct ext4_buddy e4b; int error; error = ext4_mb_load_buddy(sb, group, &e4b); if (error) return error; bitmap = e4b.bd_bitmap; ext4_lock_group(sb, group); start = (e4b.bd_info->bb_first_free > start) ? e4b.bd_info->bb_first_free : start; if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; while (start <= end) { start = mb_find_next_zero_bit(bitmap, end + 1, start); if (start > end) break; next = mb_find_next_bit(bitmap, end + 1, start); ext4_unlock_group(sb, group); error = formatter(sb, group, start, next - start, priv); if (error) goto out_unload; ext4_lock_group(sb, group); start = next + 1; } ext4_unlock_group(sb, group); out_unload: ext4_mb_unload_buddy(&e4b); return error; } |
912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 | #include <linux/gfp.h> #include <linux/initrd.h> #include <linux/ioport.h> #include <linux/swap.h> #include <linux/memblock.h> #include <linux/bootmem.h> /* for max_low_pfn */ #include <linux/swapfile.h> #include <linux/swapops.h> #include <linux/kmemleak.h> #include <asm/set_memory.h> #include <asm/e820/api.h> #include <asm/init.h> #include <asm/page.h> #include <asm/page_types.h> #include <asm/sections.h> #include <asm/setup.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <asm/proto.h> #include <asm/dma.h> /* for MAX_DMA_PFN */ #include <asm/microcode.h> #include <asm/kaslr.h> #include <asm/hypervisor.h> #include <asm/cpufeature.h> #include <asm/pti.h> /* * We need to define the tracepoints somewhere, and tlb.c * is only compied when SMP=y. */ #define CREATE_TRACE_POINTS #include <trace/events/tlb.h> #include "mm_internal.h" /* * Tables translating between page_cache_type_t and pte encoding. * * The default values are defined statically as minimal supported mode; * WC and WT fall back to UC-. pat_init() updates these values to support * more cache modes, WC and WT, when it is safe to do so. See pat_init() * for the details. Note, __early_ioremap() used during early boot-time * takes pgprot_t (pte encoding) and does not use these tables. * * Index into __cachemode2pte_tbl[] is the cachemode. * * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. */ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WB ] = 0 | 0 , [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD, [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, }; EXPORT_SYMBOL(__cachemode2pte_tbl); uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC, [__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB, [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; EXPORT_SYMBOL(__pte2cachemode_tbl); static unsigned long __initdata pgt_buf_start; static unsigned long __initdata pgt_buf_end; static unsigned long __initdata pgt_buf_top; static unsigned long min_pfn_mapped; static bool __initdata can_use_brk_pgt = true; /* * Pages returned are already directly mapped. * * Changing that is likely to break Xen, see commit: * * 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve * * for detailed information. 
*/ __ref void *alloc_low_pages(unsigned int num) { unsigned long pfn; int i; if (after_bootmem) { unsigned int order; order = get_order((unsigned long)num << PAGE_SHIFT); return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); } if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { unsigned long ret = 0; if (min_pfn_mapped < max_pfn_mapped) { ret = memblock_find_in_range( min_pfn_mapped << PAGE_SHIFT, max_pfn_mapped << PAGE_SHIFT, PAGE_SIZE * num , PAGE_SIZE); } if (ret) memblock_reserve(ret, PAGE_SIZE * num); else if (can_use_brk_pgt) ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE)); if (!ret) panic("alloc_low_pages: can not alloc memory"); pfn = ret >> PAGE_SHIFT; } else { pfn = pgt_buf_end; pgt_buf_end += num; } for (i = 0; i < num; i++) { void *adr; adr = __va((pfn + i) << PAGE_SHIFT); clear_page(adr); } return __va(pfn << PAGE_SHIFT); } /* * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS. * With KASLR memory randomization, depending on the machine e820 memory * and the PUD alignment. We may need twice more pages when KASLR memory * randomization is enabled. */ #ifndef CONFIG_RANDOMIZE_MEMORY #define INIT_PGD_PAGE_COUNT 6 #else #define INIT_PGD_PAGE_COUNT 12 #endif #define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE) RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); void __init early_alloc_pgt_buf(void) { unsigned long tables = INIT_PGT_BUF_SIZE; phys_addr_t base; base = __pa(extend_brk(tables, PAGE_SIZE)); pgt_buf_start = base >> PAGE_SHIFT; pgt_buf_end = pgt_buf_start; pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); } int after_bootmem; early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES); struct map_range { unsigned long start; unsigned long end; unsigned page_size_mask; }; static int page_size_mask; static void __init probe_page_size_mask(void) { /* * For pagealloc debugging, identity mapping will use small pages. * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. */ if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled()) page_size_mask |= 1 << PG_LEVEL_2M; else direct_gbpages = 0; /* Enable PSE if available */ if (boot_cpu_has(X86_FEATURE_PSE)) cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ __supported_pte_mask &= ~_PAGE_GLOBAL; if (boot_cpu_has(X86_FEATURE_PGE)) { cr4_set_bits_and_update_boot(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } /* By the default is everything supported: */ __default_kernel_pte_mask = __supported_pte_mask; /* Except when with PTI where the kernel is mostly non-Global: */ if (cpu_feature_enabled(X86_FEATURE_PTI)) __default_kernel_pte_mask &= ~_PAGE_GLOBAL; /* Enable 1 GB linear kernel mappings if available: */ if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) { printk(KERN_INFO "Using GB pages for direct mapping\n"); page_size_mask |= 1 << PG_LEVEL_1G; } else { direct_gbpages = 0; } } static void setup_pcid(void) { if (!IS_ENABLED(CONFIG_X86_64)) return; if (!boot_cpu_has(X86_FEATURE_PCID)) return; if (boot_cpu_has(X86_FEATURE_PGE)) { /* * This can't be cr4_set_bits_and_update_boot() -- the * trampoline code can't handle CR4.PCIDE and it wouldn't * do any good anyway. Despite the name, * cr4_set_bits_and_update_boot() doesn't actually cause * the bits in question to remain set all the way through * the secondary boot asm. * * Instead, we brute-force it and set CR4.PCIDE manually in * start_secondary(). 
*/ cr4_set_bits(X86_CR4_PCIDE); /* * INVPCID's single-context modes (2/3) only work if we set * X86_CR4_PCIDE, *and* we INVPCID support. It's unusable * on systems that have X86_CR4_PCIDE clear, or that have * no INVPCID support at all. */ if (boot_cpu_has(X86_FEATURE_INVPCID)) setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE); } else { /* * flush_tlb_all(), as currently implemented, won't work if * PCID is on but PGE is not. Since that combination * doesn't exist on real hardware, there's no reason to try * to fully support it, but it's polite to avoid corrupting * data if we're on an improperly configured VM. */ setup_clear_cpu_cap(X86_FEATURE_PCID); } } #ifdef CONFIG_X86_32 #define NR_RANGE_MR 3 #else /* CONFIG_X86_64 */ #define NR_RANGE_MR 5 #endif static int __meminit save_mr(struct map_range *mr, int nr_range, unsigned long start_pfn, unsigned long end_pfn, unsigned long page_size_mask) { if (start_pfn < end_pfn) { if (nr_range >= NR_RANGE_MR) panic("run out of range for init_memory_mapping\n"); mr[nr_range].start = start_pfn<<PAGE_SHIFT; mr[nr_range].end = end_pfn<<PAGE_SHIFT; mr[nr_range].page_size_mask = page_size_mask; nr_range++; } return nr_range; } /* * adjust the page_size_mask for small range to go with * big page size instead small one if nearby are ram too. */ static void __ref adjust_range_page_size_mask(struct map_range *mr, int nr_range) { int i; for (i = 0; i < nr_range; i++) { if ((page_size_mask & (1<<PG_LEVEL_2M)) && !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) { unsigned long start = round_down(mr[i].start, PMD_SIZE); unsigned long end = round_up(mr[i].end, PMD_SIZE); #ifdef CONFIG_X86_32 if ((end >> PAGE_SHIFT) > max_low_pfn) continue; #endif if (memblock_is_region_memory(start, end - start)) mr[i].page_size_mask |= 1<<PG_LEVEL_2M; } if ((page_size_mask & (1<<PG_LEVEL_1G)) && !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) { unsigned long start = round_down(mr[i].start, PUD_SIZE); unsigned long end = round_up(mr[i].end, PUD_SIZE); if (memblock_is_region_memory(start, end - start)) mr[i].page_size_mask |= 1<<PG_LEVEL_1G; } } } static const char *page_size_string(struct map_range *mr) { static const char str_1g[] = "1G"; static const char str_2m[] = "2M"; static const char str_4m[] = "4M"; static const char str_4k[] = "4k"; if (mr->page_size_mask & (1<<PG_LEVEL_1G)) return str_1g; /* * 32-bit without PAE has a 4M large page size. * PG_LEVEL_2M is misnamed, but we can at least * print out the right size in the string. */ if (IS_ENABLED(CONFIG_X86_32) && !IS_ENABLED(CONFIG_X86_PAE) && mr->page_size_mask & (1<<PG_LEVEL_2M)) return str_4m; if (mr->page_size_mask & (1<<PG_LEVEL_2M)) return str_2m; return str_4k; } static int __meminit split_mem_range(struct map_range *mr, int nr_range, unsigned long start, unsigned long end) { unsigned long start_pfn, end_pfn, limit_pfn; unsigned long pfn; int i; limit_pfn = PFN_DOWN(end); /* head if not big page alignment ? */ pfn = start_pfn = PFN_DOWN(start); #ifdef CONFIG_X86_32 /* * Don't use a large page for the first 2/4MB of memory * because there are often fixed size MTRRs in there * and overlapping MTRRs into large pages can cause * slowdowns. 
*/ if (pfn == 0) end_pfn = PFN_DOWN(PMD_SIZE); else end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #else /* CONFIG_X86_64 */ end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #endif if (end_pfn > limit_pfn) end_pfn = limit_pfn; if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); pfn = end_pfn; } /* big page (2M) range */ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); #ifdef CONFIG_X86_32 end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); #else /* CONFIG_X86_64 */ end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE))) end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); #endif if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<<PG_LEVEL_2M)); pfn = end_pfn; } #ifdef CONFIG_X86_64 /* big page (1G) range */ start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE)); end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); pfn = end_pfn; } /* tail is not big page (1G) alignment */ start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE)); end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE)); if (start_pfn < end_pfn) { nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, page_size_mask & (1<<PG_LEVEL_2M)); pfn = end_pfn; } #endif /* tail is not big page (2M) alignment */ start_pfn = pfn; end_pfn = limit_pfn; nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); if (!after_bootmem) adjust_range_page_size_mask(mr, nr_range); /* try to merge same page size and continuous */ for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { unsigned long old_start; if (mr[i].end != mr[i+1].start || mr[i].page_size_mask != mr[i+1].page_size_mask) continue; /* move it */ old_start = mr[i].start; memmove(&mr[i], &mr[i+1], (nr_range - 1 - i) * sizeof(struct map_range)); mr[i--].start = old_start; nr_range--; } for (i = 0; i < nr_range; i++) pr_debug(" [mem %#010lx-%#010lx] page %s\n", mr[i].start, mr[i].end - 1, page_size_string(&mr[i])); return nr_range; } struct range pfn_mapped[E820_MAX_ENTRIES]; int nr_pfn_mapped; static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn) { nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_MAX_ENTRIES, nr_pfn_mapped, start_pfn, end_pfn); nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_MAX_ENTRIES); max_pfn_mapped = max(max_pfn_mapped, end_pfn); if (start_pfn < (1UL<<(32-PAGE_SHIFT))) max_low_pfn_mapped = max(max_low_pfn_mapped, min(end_pfn, 1UL<<(32-PAGE_SHIFT))); } bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn) { int i; for (i = 0; i < nr_pfn_mapped; i++) if ((start_pfn >= pfn_mapped[i].start) && (end_pfn <= pfn_mapped[i].end)) return true; return false; } /* * Setup the direct mapping of the physical memory at PAGE_OFFSET. * This runs before bootmem is initialized and gets pages directly from * the physical memory. To access them they are temporarily mapped. 
*/ unsigned long __ref init_memory_mapping(unsigned long start, unsigned long end) { struct map_range mr[NR_RANGE_MR]; unsigned long ret = 0; int nr_range, i; pr_debug("init_memory_mapping: [mem %#010lx-%#010lx]\n", start, end - 1); memset(mr, 0, sizeof(mr)); nr_range = split_mem_range(mr, 0, start, end); for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, mr[i].page_size_mask); add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT); return ret >> PAGE_SHIFT; } /* * We need to iterate through the E820 memory map and create direct mappings * for only E820_TYPE_RAM and E820_KERN_RESERVED regions. We cannot simply * create direct mappings for all pfns from [0 to max_low_pfn) and * [4GB to max_pfn) because of possible memory holes in high addresses * that cannot be marked as UC by fixed/variable range MTRRs. * Depending on the alignment of E820 ranges, this may possibly result * in using smaller size (i.e. 4K instead of 2M or 1G) page tables. * * init_mem_mapping() calls init_range_memory_mapping() with big range. * That range would have hole in the middle or ends, and only ram parts * will be mapped in init_range_memory_mapping(). */ static unsigned long __init init_range_memory_mapping( unsigned long r_start, unsigned long r_end) { unsigned long start_pfn, end_pfn; unsigned long mapped_ram_size = 0; int i; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end); u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end); if (start >= end) continue; /* * if it is overlapping with brk pgt, we need to * alloc pgt buf from memblock instead. */ can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >= min(end, (u64)pgt_buf_top<<PAGE_SHIFT); init_memory_mapping(start, end); mapped_ram_size += end - start; can_use_brk_pgt = true; } return mapped_ram_size; } static unsigned long __init get_new_step_size(unsigned long step_size) { /* * Initial mapped size is PMD_SIZE (2M). * We can not set step_size to be PUD_SIZE (1G) yet. * In worse case, when we cross the 1G boundary, and * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k) * to map 1G range with PTE. Hence we use one less than the * difference of page table level shifts. * * Don't need to worry about overflow in the top-down case, on 32bit, * when step_size is 0, round_down() returns 0 for start, and that * turns it into 0x100000000ULL. * In the bottom-up case, round_up(x, 0) returns 0 though too, which * needs to be taken into consideration by the code below. */ return step_size << (PMD_SHIFT - PAGE_SHIFT - 1); } /** * memory_map_top_down - Map [map_start, map_end) top down * @map_start: start address of the target memory range * @map_end: end address of the target memory range * * This function will setup direct mapping for memory range * [map_start, map_end) in top-down. That said, the page tables * will be allocated at the end of the memory, and we map the * memory in top-down. 
*/ static void __init memory_map_top_down(unsigned long map_start, unsigned long map_end) { unsigned long real_end, start, last_start; unsigned long step_size; unsigned long addr; unsigned long mapped_ram_size = 0; /* xen has big range in reserved near end of ram, skip it at first.*/ addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE); real_end = addr + PMD_SIZE; /* step_size need to be small so pgt_buf from BRK could cover it */ step_size = PMD_SIZE; max_pfn_mapped = 0; /* will get exact value next */ min_pfn_mapped = real_end >> PAGE_SHIFT; last_start = start = real_end; /* * We start from the top (end of memory) and go to the bottom. * The memblock_find_in_range() gets us a block of RAM from the * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages * for page table. */ while (last_start > map_start) { if (last_start > step_size) { start = round_down(last_start - 1, step_size); if (start < map_start) start = map_start; } else start = map_start; mapped_ram_size += init_range_memory_mapping(start, last_start); last_start = start; min_pfn_mapped = last_start >> PAGE_SHIFT; if (mapped_ram_size >= step_size) step_size = get_new_step_size(step_size); } if (real_end < map_end) init_range_memory_mapping(real_end, map_end); } /** * memory_map_bottom_up - Map [map_start, map_end) bottom up * @map_start: start address of the target memory range * @map_end: end address of the target memory range * * This function will setup direct mapping for memory range * [map_start, map_end) in bottom-up. Since we have limited the * bottom-up allocation above the kernel, the page tables will * be allocated just above the kernel and we map the memory * in [map_start, map_end) in bottom-up. */ static void __init memory_map_bottom_up(unsigned long map_start, unsigned long map_end) { unsigned long next, start; unsigned long mapped_ram_size = 0; /* step_size need to be small so pgt_buf from BRK could cover it */ unsigned long step_size = PMD_SIZE; start = map_start; min_pfn_mapped = start >> PAGE_SHIFT; /* * We start from the bottom (@map_start) and go to the top (@map_end). * The memblock_find_in_range() gets us a block of RAM from the * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages * for page table. */ while (start < map_end) { if (step_size && map_end - start > step_size) { next = round_up(start + 1, step_size); if (next > map_end) next = map_end; } else { next = map_end; } mapped_ram_size += init_range_memory_mapping(start, next); start = next; if (mapped_ram_size >= step_size) step_size = get_new_step_size(step_size); } } void __init init_mem_mapping(void) { unsigned long end; pti_check_boottime_disable(); probe_page_size_mask(); setup_pcid(); #ifdef CONFIG_X86_64 end = max_pfn << PAGE_SHIFT; #else end = max_low_pfn << PAGE_SHIFT; #endif /* the ISA range is always mapped regardless of memory holes */ init_memory_mapping(0, ISA_END_ADDRESS); /* Init the trampoline, possibly with KASLR memory offset */ init_trampoline(); /* * If the allocation is in bottom-up direction, we setup direct mapping * in bottom-up, otherwise we setup direct mapping in top-down. */ if (memblock_bottom_up()) { unsigned long kernel_end = __pa_symbol(_end); /* * we need two separate calls here. This is because we want to * allocate page tables above the kernel. So we first map * [kernel_end, end) to make memory above the kernel be mapped * as soon as possible. And then use page tables allocated above * the kernel to map [ISA_END_ADDRESS, kernel_end). 
*/ memory_map_bottom_up(kernel_end, end); memory_map_bottom_up(ISA_END_ADDRESS, kernel_end); } else { memory_map_top_down(ISA_END_ADDRESS, end); } #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { /* can we preseve max_low_pfn ?*/ max_low_pfn = max_pfn; } #else early_ioremap_page_table_range_init(); #endif load_cr3(swapper_pg_dir); __flush_tlb_all(); x86_init.hyper.init_mem_mapping(); early_memtest(0, max_pfn_mapped << PAGE_SHIFT); } /* * devmem_is_allowed() checks to see if /dev/mem access to a certain address * is valid. The argument is a physical page number. * * On x86, access has to be given to the first megabyte of RAM because that * area traditionally contains BIOS code and data regions used by X, dosemu, * and similar apps. Since they map the entire memory range, the whole range * must be allowed (for mapping), but any areas that would otherwise be * disallowed are flagged as being "zero filled" instead of rejected. * Access has to be given to non-kernel-ram areas as well, these contain the * PCI mmio resources as well as potential bios/acpi data regions. */ int devmem_is_allowed(unsigned long pagenr) { if (region_intersects(PFN_PHYS(pagenr), PAGE_SIZE, IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE) != REGION_DISJOINT) { /* * For disallowed memory regions in the low 1MB range, * request that the page be shown as all zeros. */ if (pagenr < 256) return 2; return 0; } /* * This must follow RAM test, since System RAM is considered a * restricted resource under CONFIG_STRICT_IOMEM. */ if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) { /* Low 1MB bypasses iomem restrictions. */ if (pagenr < 256) return 1; return 0; } return 1; } void free_init_pages(char *what, unsigned long begin, unsigned long end) { unsigned long begin_aligned, end_aligned; /* Make sure boundaries are page aligned */ begin_aligned = PAGE_ALIGN(begin); end_aligned = end & PAGE_MASK; if (WARN_ON(begin_aligned != begin || end_aligned != end)) { begin = begin_aligned; end = end_aligned; } if (begin >= end) return; /* * If debugging page accesses then do not free this memory but * mark them not present - any buggy init-section access will * create a kernel page fault: */ if (debug_pagealloc_enabled()) { pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n", begin, end - 1); /* * Inform kmemleak about the hole in the memory since the * corresponding pages will be unmapped. */ kmemleak_free_part((void *)begin, end - begin); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); } else { /* * We just marked the kernel text read only above, now that * we are going to free part of that, we need to make that * writeable and non-executable first. */ set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what); } } /* * begin/end can be in the direct map or the "high kernel mapping" * used for the kernel image only. free_init_pages() will do the * right thing for either kind of address. */ void free_kernel_image_pages(void *begin, void *end) { unsigned long begin_ul = (unsigned long)begin; unsigned long end_ul = (unsigned long)end; unsigned long len_pages = (end_ul - begin_ul) >> PAGE_SHIFT; free_init_pages("unused kernel image", begin_ul, end_ul); /* * PTI maps some of the kernel into userspace. For performance, * this includes some kernel areas that do not contain secrets. * Those areas might be adjacent to the parts of the kernel image * being freed, which may contain secrets. 
Remove the "high kernel * image mapping" for these freed areas, ensuring they are not even * potentially vulnerable to Meltdown regardless of the specific * optimizations PTI is currently using. * * The "noalias" prevents unmapping the direct map alias which is * needed to access the freed pages. * * This is only valid for 64bit kernels. 32bit has only one mapping * which can't be treated in this way for obvious reasons. */ if (IS_ENABLED(CONFIG_X86_64) && cpu_feature_enabled(X86_FEATURE_PTI)) set_memory_np_noalias(begin_ul, len_pages); } void __weak mem_encrypt_free_decrypted_mem(void) { } void __ref free_initmem(void) { e820__reallocate_tables(); mem_encrypt_free_decrypted_mem(); free_kernel_image_pages(&__init_begin, &__init_end); } #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { /* * end could be not aligned, and We can not align that, * decompresser could be confused by aligned initrd_end * We already reserve the end partial page before in * - i386_start_kernel() * - x86_64_start_kernel() * - relocate_initrd() * So here We can do PAGE_ALIGN() safely to get partial page to be freed */ free_init_pages("initrd", start, PAGE_ALIGN(end)); } #endif /* * Calculate the precise size of the DMA zone (first 16 MB of RAM), * and pass it to the MM layer - to help it set zone watermarks more * accurately. * * Done on 64-bit systems only for the time being, although 32-bit systems * might benefit from this as well. */ void __init memblock_find_dma_reserve(void) { #ifdef CONFIG_X86_64 u64 nr_pages = 0, nr_free_pages = 0; unsigned long start_pfn, end_pfn; phys_addr_t start_addr, end_addr; int i; u64 u; /* * Iterate over all memory ranges (free and reserved ones alike), * to calculate the total number of pages in the first 16 MB of RAM: */ nr_pages = 0; for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { start_pfn = min(start_pfn, MAX_DMA_PFN); end_pfn = min(end_pfn, MAX_DMA_PFN); nr_pages += end_pfn - start_pfn; } /* * Iterate over free memory ranges to calculate the number of free * pages in the DMA zone, while not counting potential partial * pages at the beginning or the end of the range: */ nr_free_pages = 0; for_each_free_mem_range(u, NUMA_NO_NODE, MEMBLOCK_NONE, &start_addr, &end_addr, NULL) { start_pfn = min_t(unsigned long, PFN_UP(start_addr), MAX_DMA_PFN); end_pfn = min_t(unsigned long, PFN_DOWN(end_addr), MAX_DMA_PFN); if (start_pfn < end_pfn) nr_free_pages += end_pfn - start_pfn; } set_dma_reserve(nr_pages - nr_free_pages); #endif } void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); #ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn); #endif #ifdef CONFIG_ZONE_DMA32 max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn); #endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM max_zone_pfns[ZONE_HIGHMEM] = max_pfn; #endif free_area_init_nodes(max_zone_pfns); } __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { .loaded_mm = &init_mm, .next_asid = 1, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; EXPORT_PER_CPU_SYMBOL(cpu_tlbstate); void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) { /* entry 0 MUST be WB (hardwired to speed up translations) */ BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB); __cachemode2pte_tbl[cache] = __cm_idx2pte(entry); __pte2cachemode_tbl[entry] = cache; } #ifdef CONFIG_SWAP unsigned long 
max_swapfile_size(void) { unsigned long pages; pages = generic_max_swapfile_size(); if (boot_cpu_has_bug(X86_BUG_L1TF) && l1tf_mitigation != L1TF_MITIGATION_OFF) { /* Limit the swap file size to MAX_PA/2 for L1TF workaround */ unsigned long long l1tf_limit = l1tf_pfn_limit(); /* * We encode swap offsets also with 3 bits below those for pfn * which makes the usable limit higher. */ #if CONFIG_PGTABLE_LEVELS > 2 l1tf_limit <<= PAGE_SHIFT - SWP_OFFSET_FIRST_BIT; #endif pages = min_t(unsigned long long, l1tf_limit, pages); } return pages; } #endif |
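/*
 * Illustrative sketch (not part of the original file): split_mem_range()
 * above carves a physical range into an unaligned 4k head, large-page
 * capable middle pieces and a 4k tail.  The stand-alone user-space program
 * below reproduces only the 4k/2M alignment arithmetic so the idea is
 * visible in isolation; demo_split_range() and the other demo_* names are
 * invented, and the real code additionally works in PFN units, handles 1G
 * pages and merges adjacent ranges.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_PMD_SIZE	0x200000ULL		/* 2M large page */

#define DEMO_ROUND_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))
#define DEMO_ROUND_DOWN(x, a)	((x) & ~((a) - 1))

static void demo_emit(uint64_t start, uint64_t end, const char *size)
{
	if (start < end)
		printf("  [mem %#012llx-%#012llx] page %s\n",
		       (unsigned long long)start,
		       (unsigned long long)end - 1, size);
}

static void demo_split_range(uint64_t start, uint64_t end)
{
	/* Head: from start up to the first 2M boundary, mapped with 4k. */
	uint64_t head_end = DEMO_ROUND_UP(start, DEMO_PMD_SIZE);
	/* Tail: from the last 2M boundary up to end, mapped with 4k. */
	uint64_t tail_start = DEMO_ROUND_DOWN(end, DEMO_PMD_SIZE);

	if (head_end > end)
		head_end = end;
	if (tail_start < head_end)
		tail_start = head_end;

	demo_emit(start, head_end, "4k");
	demo_emit(head_end, tail_start, "2M");
	demo_emit(tail_start, end, "4k");
}

int main(void)
{
	/* Example: a range that is 4k aligned but not 2M aligned. */
	demo_split_range(0x1000, 0x2345000);
	return 0;
}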
// SPDX-License-Identifier: GPL-2.0 /* * PCI searching functions * * Copyright (C) 1993 -- 1997 Drew Eckhardt, Frederic Potter, * David Mosberger-Tang * Copyright (C) 1997 -- 2000 Martin Mares <mj@ucw.cz> * Copyright (C) 2003 -- 2004 Greg Kroah-Hartman <greg@kroah.com> */ #include <linux/pci.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/interrupt.h> #include "pci.h" DECLARE_RWSEM(pci_bus_sem); EXPORT_SYMBOL_GPL(pci_bus_sem); /* * pci_for_each_dma_alias - Iterate over DMA aliases for a device * @pdev: starting downstream device * @fn: function to call for each alias * @data: opaque data to pass to @fn * * Starting @pdev, walk up the bus calling @fn for each possible alias * of @pdev at the root bus. */ int pci_for_each_dma_alias(struct pci_dev *pdev, int (*fn)(struct pci_dev *pdev, u16 alias, void *data), void *data) { struct pci_bus *bus; int ret; ret = fn(pdev, PCI_DEVID(pdev->bus->number, pdev->devfn), data); if (ret) return ret; /* * If the device is broken and uses an alias requester ID for * DMA, iterate over that too. */ if (unlikely(pdev->dma_alias_mask)) { u8 devfn; for_each_set_bit(devfn, pdev->dma_alias_mask, U8_MAX) { ret = fn(pdev, PCI_DEVID(pdev->bus->number, devfn), data); if (ret) return ret; } } for (bus = pdev->bus; !pci_is_root_bus(bus); bus = bus->parent) { struct pci_dev *tmp; /* Skip virtual buses */ if (!bus->self) continue; tmp = bus->self; /* stop at bridge where translation unit is associated */ if (tmp->dev_flags & PCI_DEV_FLAGS_BRIDGE_XLATE_ROOT) return ret; /* * PCIe-to-PCI/X bridges alias transactions from downstream * devices using the subordinate bus number (PCI Express to * PCI/PCI-X Bridge Spec, rev 1.0, sec 2.3). For all cases * where the upstream bus is PCI/X we alias to the bridge * (there are various conditions in the previous reference * where the bridge may take ownership of transactions, even * when the secondary interface is PCI-X).
*/ if (pci_is_pcie(tmp)) { switch (pci_pcie_type(tmp)) { case PCI_EXP_TYPE_ROOT_PORT: case PCI_EXP_TYPE_UPSTREAM: case PCI_EXP_TYPE_DOWNSTREAM: continue; case PCI_EXP_TYPE_PCI_BRIDGE: ret = fn(tmp, PCI_DEVID(tmp->subordinate->number, PCI_DEVFN(0, 0)), data); if (ret) return ret; continue; case PCI_EXP_TYPE_PCIE_BRIDGE: ret = fn(tmp, PCI_DEVID(tmp->bus->number, tmp->devfn), data); if (ret) return ret; continue; } } else { if (tmp->dev_flags & PCI_DEV_FLAG_PCIE_BRIDGE_ALIAS) ret = fn(tmp, PCI_DEVID(tmp->subordinate->number, PCI_DEVFN(0, 0)), data); else ret = fn(tmp, PCI_DEVID(tmp->bus->number, tmp->devfn), data); if (ret) return ret; } } return ret; } static struct pci_bus *pci_do_find_bus(struct pci_bus *bus, unsigned char busnr) { struct pci_bus *child; struct pci_bus *tmp; if (bus->number == busnr) return bus; list_for_each_entry(tmp, &bus->children, node) { child = pci_do_find_bus(tmp, busnr); if (child) return child; } return NULL; } /** * pci_find_bus - locate PCI bus from a given domain and bus number * @domain: number of PCI domain to search * @busnr: number of desired PCI bus * * Given a PCI bus number and domain number, the desired PCI bus is located * in the global list of PCI buses. If the bus is found, a pointer to its * data structure is returned. If no bus is found, %NULL is returned. */ struct pci_bus *pci_find_bus(int domain, int busnr) { struct pci_bus *bus = NULL; struct pci_bus *tmp_bus; while ((bus = pci_find_next_bus(bus)) != NULL) { if (pci_domain_nr(bus) != domain) continue; tmp_bus = pci_do_find_bus(bus, busnr); if (tmp_bus) return tmp_bus; } return NULL; } EXPORT_SYMBOL(pci_find_bus); /** * pci_find_next_bus - begin or continue searching for a PCI bus * @from: Previous PCI bus found, or %NULL for new search. * * Iterates through the list of known PCI buses. A new search is * initiated by passing %NULL as the @from argument. Otherwise if * @from is not %NULL, searches continue from next device on the * global list. */ struct pci_bus *pci_find_next_bus(const struct pci_bus *from) { struct list_head *n; struct pci_bus *b = NULL; WARN_ON(in_interrupt()); down_read(&pci_bus_sem); n = from ? from->node.next : pci_root_buses.next; if (n != &pci_root_buses) b = list_entry(n, struct pci_bus, node); up_read(&pci_bus_sem); return b; } EXPORT_SYMBOL(pci_find_next_bus); /** * pci_get_slot - locate PCI device for a given PCI slot * @bus: PCI bus on which desired PCI device resides * @devfn: encodes number of PCI slot in which the desired PCI * device resides and the logical device number within that slot * in case of multi-function devices. * * Given a PCI bus and slot/function number, the desired PCI device * is located in the list of PCI devices. * If the device is found, its reference count is increased and this * function returns a pointer to its data structure. The caller must * decrement the reference count by calling pci_dev_put(). * If no device is found, %NULL is returned. */ struct pci_dev *pci_get_slot(struct pci_bus *bus, unsigned int devfn) { struct pci_dev *dev; WARN_ON(in_interrupt()); down_read(&pci_bus_sem); list_for_each_entry(dev, &bus->devices, bus_list) { if (dev->devfn == devfn) goto out; } dev = NULL; out: pci_dev_get(dev); up_read(&pci_bus_sem); return dev; } EXPORT_SYMBOL(pci_get_slot); /** * pci_get_domain_bus_and_slot - locate PCI device for a given PCI domain (segment), bus, and slot * @domain: PCI domain/segment on which the PCI device resides. 
* @bus: PCI bus on which desired PCI device resides * @devfn: encodes number of PCI slot in which the desired PCI device * resides and the logical device number within that slot in case of * multi-function devices. * * Given a PCI domain, bus, and slot/function number, the desired PCI * device is located in the list of PCI devices. If the device is * found, its reference count is increased and this function returns a * pointer to its data structure. The caller must decrement the * reference count by calling pci_dev_put(). If no device is found, * %NULL is returned. */ struct pci_dev *pci_get_domain_bus_and_slot(int domain, unsigned int bus, unsigned int devfn) { struct pci_dev *dev = NULL; for_each_pci_dev(dev) { if (pci_domain_nr(dev->bus) == domain && (dev->bus->number == bus && dev->devfn == devfn)) return dev; } return NULL; } EXPORT_SYMBOL(pci_get_domain_bus_and_slot); static int match_pci_dev_by_id(struct device *dev, void *data) { struct pci_dev *pdev = to_pci_dev(dev); struct pci_device_id *id = data; if (pci_match_one_device(id, pdev)) return 1; return 0; } /* * pci_get_dev_by_id - begin or continue searching for a PCI device by id * @id: pointer to struct pci_device_id to match for the device * @from: Previous PCI device found in search, or %NULL for new search. * * Iterates through the list of known PCI devices. If a PCI device is found * with a matching id a pointer to its device structure is returned, and the * reference count to the device is incremented. Otherwise, %NULL is returned. * A new search is initiated by passing %NULL as the @from argument. Otherwise * if @from is not %NULL, searches continue from next device on the global * list. The reference count for @from is always decremented if it is not * %NULL. * * This is an internal function for use by the other search functions in * this file. */ static struct pci_dev *pci_get_dev_by_id(const struct pci_device_id *id, struct pci_dev *from) { struct device *dev; struct device *dev_start = NULL; struct pci_dev *pdev = NULL; WARN_ON(in_interrupt()); if (from) dev_start = &from->dev; dev = bus_find_device(&pci_bus_type, dev_start, (void *)id, match_pci_dev_by_id); if (dev) pdev = to_pci_dev(dev); pci_dev_put(from); return pdev; } /** * pci_get_subsys - begin or continue searching for a PCI device by vendor/subvendor/device/subdevice id * @vendor: PCI vendor id to match, or %PCI_ANY_ID to match all vendor ids * @device: PCI device id to match, or %PCI_ANY_ID to match all device ids * @ss_vendor: PCI subsystem vendor id to match, or %PCI_ANY_ID to match all vendor ids * @ss_device: PCI subsystem device id to match, or %PCI_ANY_ID to match all device ids * @from: Previous PCI device found in search, or %NULL for new search. * * Iterates through the list of known PCI devices. If a PCI device is found * with a matching @vendor, @device, @ss_vendor and @ss_device, a pointer to its * device structure is returned, and the reference count to the device is * incremented. Otherwise, %NULL is returned. A new search is initiated by * passing %NULL as the @from argument. Otherwise if @from is not %NULL, * searches continue from next device on the global list. * The reference count for @from is always decremented if it is not %NULL. 
*/ struct pci_dev *pci_get_subsys(unsigned int vendor, unsigned int device, unsigned int ss_vendor, unsigned int ss_device, struct pci_dev *from) { struct pci_device_id id = { .vendor = vendor, .device = device, .subvendor = ss_vendor, .subdevice = ss_device, }; return pci_get_dev_by_id(&id, from); } EXPORT_SYMBOL(pci_get_subsys); /** * pci_get_device - begin or continue searching for a PCI device by vendor/device id * @vendor: PCI vendor id to match, or %PCI_ANY_ID to match all vendor ids * @device: PCI device id to match, or %PCI_ANY_ID to match all device ids * @from: Previous PCI device found in search, or %NULL for new search. * * Iterates through the list of known PCI devices. If a PCI device is * found with a matching @vendor and @device, the reference count to the * device is incremented and a pointer to its device structure is returned. * Otherwise, %NULL is returned. A new search is initiated by passing %NULL * as the @from argument. Otherwise if @from is not %NULL, searches continue * from next device on the global list. The reference count for @from is * always decremented if it is not %NULL. */ struct pci_dev *pci_get_device(unsigned int vendor, unsigned int device, struct pci_dev *from) { return pci_get_subsys(vendor, device, PCI_ANY_ID, PCI_ANY_ID, from); } EXPORT_SYMBOL(pci_get_device); /** * pci_get_class - begin or continue searching for a PCI device by class * @class: search for a PCI device with this class designation * @from: Previous PCI device found in search, or %NULL for new search. * * Iterates through the list of known PCI devices. If a PCI device is * found with a matching @class, the reference count to the device is * incremented and a pointer to its device structure is returned. * Otherwise, %NULL is returned. * A new search is initiated by passing %NULL as the @from argument. * Otherwise if @from is not %NULL, searches continue from next device * on the global list. The reference count for @from is always decremented * if it is not %NULL. */ struct pci_dev *pci_get_class(unsigned int class, struct pci_dev *from) { struct pci_device_id id = { .vendor = PCI_ANY_ID, .device = PCI_ANY_ID, .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, .class_mask = PCI_ANY_ID, .class = class, }; return pci_get_dev_by_id(&id, from); } EXPORT_SYMBOL(pci_get_class); /** * pci_dev_present - Returns 1 if device matching the device list is present, 0 if not. * @ids: A pointer to a null terminated list of struct pci_device_id structures * that describe the type of PCI device the caller is trying to find. * * Obvious fact: You do not have a reference to any device that might be found * by this function, so if that device is removed from the system right after * this function is finished, the value will be stale. Use this function to * find devices that are usually built into a system, or for a general hint as * to if another device happens to be present at this specific moment in time. */ int pci_dev_present(const struct pci_device_id *ids) { struct pci_dev *found = NULL; WARN_ON(in_interrupt()); while (ids->vendor || ids->subvendor || ids->class_mask) { found = pci_get_dev_by_id(ids, NULL); if (found) { pci_dev_put(found); return 1; } ids++; } return 0; } EXPORT_SYMBOL(pci_dev_present); |
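/*
 * Illustrative sketch (not part of the original file): the usual way to
 * consume the search API above is a pci_get_device() loop.  Each call
 * drops the reference on the device passed in via @from and returns the
 * next match with an elevated reference, so only a device still held
 * after breaking out of the loop needs an explicit pci_dev_put().  The
 * function name and the vendor ID below are invented for the example.
 */
#include <linux/pci.h>

static void demo_walk_vendor_devices(void)
{
	struct pci_dev *pdev = NULL;

	/* Walk every device whose vendor ID is 0x8086 (example value). */
	while ((pdev = pci_get_device(0x8086, PCI_ANY_ID, pdev)) != NULL) {
		dev_info(&pdev->dev, "found %04x:%04x\n",
			 pdev->vendor, pdev->device);
		/*
		 * No pci_dev_put() here: the next pci_get_device() call
		 * releases the reference taken for this device.  A caller
		 * that breaks out early must call pci_dev_put(pdev) on the
		 * device it still holds.
		 */
	}
}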
/* SPDX-License-Identifier: GPL-2.0 */ /* * Runtime locking correctness validator * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * see Documentation/locking/lockdep-design.txt for more details. */ #ifndef __LINUX_LOCKDEP_H #define __LINUX_LOCKDEP_H struct task_struct; struct lockdep_map; /* for sysctl */ extern int prove_locking; extern int lock_stat; #define MAX_LOCKDEP_SUBCLASSES 8UL #include <linux/types.h> #ifdef CONFIG_LOCKDEP #include <linux/linkage.h> #include <linux/list.h> #include <linux/debug_locks.h> #include <linux/stacktrace.h> /* * We'd rather not expose kernel/lockdep_states.h this wide, but we do need * the total number of states... :-( */ #define XXX_LOCK_USAGE_STATES (1+2*4) /* * NR_LOCKDEP_CACHING_CLASSES ... Number of classes * cached in the instance of lockdep_map * * Currently the main class (subclass == 0) and the single-depth subclass * are cached in lockdep_map. This optimization mainly targets rq->lock, * which double_rq_lock() acquires with single-depth nesting and which is * highly contended.
*/ #define NR_LOCKDEP_CACHING_CLASSES 2 /* * Lock-classes are keyed via unique addresses, by embedding the * lockclass-key into the kernel (or module) .data section. (For * static locks we use the lock address itself as the key.) */ struct lockdep_subclass_key { char __one_byte; } __attribute__ ((__packed__)); struct lock_class_key { struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES]; }; extern struct lock_class_key __lockdep_no_validate__; #define LOCKSTAT_POINTS 4 /* * The lock-class itself: */ struct lock_class { /* * class-hash: */ struct hlist_node hash_entry; /* * global list of all lock-classes: */ struct list_head lock_entry; struct lockdep_subclass_key *key; unsigned int subclass; unsigned int dep_gen_id; /* * IRQ/softirq usage tracking bits: */ unsigned long usage_mask; struct stack_trace usage_traces[XXX_LOCK_USAGE_STATES]; /* * These fields represent a directed graph of lock dependencies, * to every node we attach a list of "forward" and a list of * "backward" graph nodes. */ struct list_head locks_after, locks_before; /* * Generation counter, when doing certain classes of graph walking, * to ensure that we check one node only once: */ unsigned int version; /* * Statistics counter: */ unsigned long ops; const char *name; int name_version; #ifdef CONFIG_LOCK_STAT unsigned long contention_point[LOCKSTAT_POINTS]; unsigned long contending_point[LOCKSTAT_POINTS]; #endif }; #ifdef CONFIG_LOCK_STAT struct lock_time { s64 min; s64 max; s64 total; unsigned long nr; }; enum bounce_type { bounce_acquired_write, bounce_acquired_read, bounce_contended_write, bounce_contended_read, nr_bounce_types, bounce_acquired = bounce_acquired_write, bounce_contended = bounce_contended_write, }; struct lock_class_stats { unsigned long contention_point[LOCKSTAT_POINTS]; unsigned long contending_point[LOCKSTAT_POINTS]; struct lock_time read_waittime; struct lock_time write_waittime; struct lock_time read_holdtime; struct lock_time write_holdtime; unsigned long bounces[nr_bounce_types]; }; struct lock_class_stats lock_stats(struct lock_class *class); void clear_lock_stats(struct lock_class *class); #endif /* * Map the lock object (the lock instance) to the lock-class object. * This is embedded into specific lock instances: */ struct lockdep_map { struct lock_class_key *key; struct lock_class *class_cache[NR_LOCKDEP_CACHING_CLASSES]; const char *name; #ifdef CONFIG_LOCK_STAT int cpu; unsigned long ip; #endif }; static inline void lockdep_copy_map(struct lockdep_map *to, struct lockdep_map *from) { int i; *to = *from; /* * Since the class cache can be modified concurrently we could observe * half pointers (64bit arch using 32bit copy insns). Therefore clear * the caches and take the performance hit. * * XXX it doesn't work well with lockdep_set_class_and_subclass(), since * that relies on cache abuse. */ for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) to->class_cache[i] = NULL; } /* * Every lock has a list of other locks that were taken after it. * We only grow the list, never remove from it: */ struct lock_list { struct list_head entry; struct lock_class *class; struct stack_trace trace; int distance; /* * The parent field is used to implement breadth-first search, and the * bit 0 is reused to indicate if the lock has been accessed in BFS. 
*/ struct lock_list *parent; }; /* * We record lock dependency chains, so that we can cache them: */ struct lock_chain { /* see BUILD_BUG_ON()s in lookup_chain_cache() */ unsigned int irq_context : 2, depth : 6, base : 24; /* 4 byte hole */ struct hlist_node entry; u64 chain_key; }; #define MAX_LOCKDEP_KEYS_BITS 13 /* * Subtract one because we offset hlock->class_idx by 1 in order * to make 0 mean no class. This avoids overflowing the class_idx * bitfield and hitting the BUG in hlock_class(). */ #define MAX_LOCKDEP_KEYS ((1UL << MAX_LOCKDEP_KEYS_BITS) - 1) struct held_lock { /* * One-way hash of the dependency chain up to this point. We * hash the hashes step by step as the dependency chain grows. * * We use it for dependency-caching and we skip detection * passes and dependency-updates if there is a cache-hit, so * it is absolutely critical for 100% coverage of the validator * to have a unique key value for every unique dependency path * that can occur in the system, to make a unique hash value * as likely as possible - hence the 64-bit width. * * The task struct holds the current hash value (initialized * with zero), here we store the previous hash value: */ u64 prev_chain_key; unsigned long acquire_ip; struct lockdep_map *instance; struct lockdep_map *nest_lock; #ifdef CONFIG_LOCK_STAT u64 waittime_stamp; u64 holdtime_stamp; #endif unsigned int class_idx:MAX_LOCKDEP_KEYS_BITS; /* * The lock-stack is unified in that the lock chains of interrupt * contexts nest ontop of process context chains, but we 'separate' * the hashes by starting with 0 if we cross into an interrupt * context, and we also keep do not add cross-context lock * dependencies - the lock usage graph walking covers that area * anyway, and we'd just unnecessarily increase the number of * dependencies otherwise. [Note: hardirq and softirq contexts * are separated from each other too.] 
* * The following field is used to detect when we cross into an * interrupt context: */ unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */ unsigned int trylock:1; /* 16 bits */ unsigned int read:2; /* see lock_acquire() comment */ unsigned int check:1; /* see lock_acquire() comment */ unsigned int hardirqs_off:1; unsigned int references:12; /* 32 bits */ unsigned int pin_count; }; /* * Initialization, self-test and debugging-output methods: */ extern void lockdep_init(void); extern void lockdep_reset(void); extern void lockdep_reset_lock(struct lockdep_map *lock); extern void lockdep_free_key_range(void *start, unsigned long size); extern asmlinkage void lockdep_sys_exit(void); extern void lockdep_off(void); extern void lockdep_on(void); /* * These methods are used by specific locking variants (spinlocks, * rwlocks, mutexes and rwsems) to pass init/acquire/release events * to lockdep: */ extern void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass); /* * Reinitialize a lock key - for cases where there is special locking or * special initialization of locks so that the validator gets the scope * of dependencies wrong: they are either too broad (they need a class-split) * or they are too narrow (they suffer from a false class-split): */ #define lockdep_set_class(lock, key) \ lockdep_init_map(&(lock)->dep_map, #key, key, 0) #define lockdep_set_class_and_name(lock, key, name) \ lockdep_init_map(&(lock)->dep_map, name, key, 0) #define lockdep_set_class_and_subclass(lock, key, sub) \ lockdep_init_map(&(lock)->dep_map, #key, key, sub) #define lockdep_set_subclass(lock, sub) \ lockdep_init_map(&(lock)->dep_map, #lock, \ (lock)->dep_map.key, sub) #define lockdep_set_novalidate_class(lock) \ lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock) /* * Compare locking classes */ #define lockdep_match_class(lock, key) lockdep_match_key(&(lock)->dep_map, key) static inline int lockdep_match_key(struct lockdep_map *lock, struct lock_class_key *key) { return lock->key == key; } /* * Acquire a lock. * * Values for "read": * * 0: exclusive (write) acquire * 1: read-acquire (no recursion allowed) * 2: read-acquire with same-instance recursion allowed * * Values for check: * * 0: simple checks (freeing, held-at-exit-time, etc.) * 1: full validation */ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *nest_lock, unsigned long ip); extern void lock_release(struct lockdep_map *lock, int nested, unsigned long ip); /* * Same "read" as for lock_acquire(), except -1 means any. 
*/ extern int lock_is_held_type(const struct lockdep_map *lock, int read); static inline int lock_is_held(const struct lockdep_map *lock) { return lock_is_held_type(lock, -1); } #define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) #define lockdep_is_held_type(lock, r) lock_is_held_type(&(lock)->dep_map, (r)) extern void lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, unsigned long ip); static inline void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass, unsigned long ip) { lock_set_class(lock, lock->name, lock->key, subclass, ip); } extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip); struct pin_cookie { unsigned int val; }; #define NIL_COOKIE (struct pin_cookie){ .val = 0U, } extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock); extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie); extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0) #define lockdep_assert_held(l) do { \ WARN_ON(debug_locks && !lockdep_is_held(l)); \ } while (0) #define lockdep_assert_held_exclusive(l) do { \ WARN_ON(debug_locks && !lockdep_is_held_type(l, 0)); \ } while (0) #define lockdep_assert_held_read(l) do { \ WARN_ON(debug_locks && !lockdep_is_held_type(l, 1)); \ } while (0) #define lockdep_assert_held_once(l) do { \ WARN_ON_ONCE(debug_locks && !lockdep_is_held(l)); \ } while (0) #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) #define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map) #define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) #define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) #else /* !CONFIG_LOCKDEP */ static inline void lockdep_off(void) { } static inline void lockdep_on(void) { } # define lock_acquire(l, s, t, r, c, n, i) do { } while (0) # define lock_release(l, n, i) do { } while (0) # define lock_downgrade(l, i) do { } while (0) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_init_map(lock, name, key, sub) \ do { (void)(name); (void)(key); } while (0) # define lockdep_set_class(lock, key) do { (void)(key); } while (0) # define lockdep_set_class_and_name(lock, key, name) \ do { (void)(key); (void)(name); } while (0) #define lockdep_set_class_and_subclass(lock, key, sub) \ do { (void)(key); } while (0) #define lockdep_set_subclass(lock, sub) do { } while (0) #define lockdep_set_novalidate_class(lock) do { } while (0) /* * We don't define lockdep_match_class() and lockdep_match_key() for !LOCKDEP * case since the result is not well defined and the caller should rather * #ifdef the call himself. 
*/ # define lockdep_reset() do { debug_locks = 1; } while (0) # define lockdep_free_key_range(start, size) do { } while (0) # define lockdep_sys_exit() do { } while (0) /* * The class key takes no space if lockdep is disabled: */ struct lock_class_key { }; /* * The lockdep_map takes no space if lockdep is disabled: */ struct lockdep_map { }; #define lockdep_depth(tsk) (0) #define lockdep_is_held_type(l, r) (1) #define lockdep_assert_held(l) do { (void)(l); } while (0) #define lockdep_assert_held_exclusive(l) do { (void)(l); } while (0) #define lockdep_assert_held_read(l) do { (void)(l); } while (0) #define lockdep_assert_held_once(l) do { (void)(l); } while (0) #define lockdep_recursing(tsk) (0) struct pin_cookie { }; #define NIL_COOKIE (struct pin_cookie){ } #define lockdep_pin_lock(l) ({ struct pin_cookie cookie; cookie; }) #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) #endif /* !LOCKDEP */ enum xhlock_context_t { XHLOCK_HARD, XHLOCK_SOFT, XHLOCK_CTX_NR, }; #define lockdep_init_map_crosslock(m, n, k, s) do {} while (0) /* * To initialize a lockdep_map statically use this macro. * Note that _name must not be NULL. */ #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ { .name = (_name), .key = (void *)(_key), } static inline void lockdep_invariant_state(bool force) {} static inline void lockdep_init_task(struct task_struct *task) {} static inline void lockdep_free_task(struct task_struct *task) {} #ifdef CONFIG_LOCK_STAT extern void lock_contended(struct lockdep_map *lock, unsigned long ip); extern void lock_acquired(struct lockdep_map *lock, unsigned long ip); #define LOCK_CONTENDED(_lock, try, lock) \ do { \ if (!try(_lock)) { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ lock(_lock); \ } \ lock_acquired(&(_lock)->dep_map, _RET_IP_); \ } while (0) #define LOCK_CONTENDED_RETURN(_lock, try, lock) \ ({ \ int ____err = 0; \ if (!try(_lock)) { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ ____err = lock(_lock); \ } \ if (!____err) \ lock_acquired(&(_lock)->dep_map, _RET_IP_); \ ____err; \ }) #else /* CONFIG_LOCK_STAT */ #define lock_contended(lockdep_map, ip) do {} while (0) #define lock_acquired(lockdep_map, ip) do {} while (0) #define LOCK_CONTENDED(_lock, try, lock) \ lock(_lock) #define LOCK_CONTENDED_RETURN(_lock, try, lock) \ lock(_lock) #endif /* CONFIG_LOCK_STAT */ #ifdef CONFIG_LOCKDEP /* * On lockdep we dont want the hand-coded irq-enable of * _raw_*_lock_flags() code, because lockdep assumes * that interrupts are not re-enabled during lock-acquire: */ #define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \ LOCK_CONTENDED((_lock), (try), (lock)) #else /* CONFIG_LOCKDEP */ #define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \ lockfl((_lock), (flags)) #endif /* CONFIG_LOCKDEP */ #ifdef CONFIG_PROVE_LOCKING extern void print_irqtrace_events(struct task_struct *curr); #else static inline void print_irqtrace_events(struct task_struct *curr) { } #endif /* * For trivial one-depth nesting of a lock-class, the following * global define can be used. (Subsystems with multiple levels * of nesting should define their own lock-nesting subclasses.) 
*/ #define SINGLE_DEPTH_NESTING 1 /* * Map the dependency ops to NOP or to real lockdep ops, depending * on the per lock-class debug mode: */ #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) #define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define spin_release(l, n, i) lock_release(l, n, i) #define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define rwlock_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) #define rwlock_release(l, n, i) lock_release(l, n, i) #define seqcount_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define seqcount_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) #define seqcount_release(l, n, i) lock_release(l, n, i) #define mutex_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define mutex_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define mutex_release(l, n, i) lock_release(l, n, i) #define rwsem_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define rwsem_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define rwsem_acquire_read(l, s, t, i) lock_acquire_shared(l, s, t, NULL, i) #define rwsem_release(l, n, i) lock_release(l, n, i) #define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_tryread(l) lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_release(l) lock_release(l, 1, _THIS_IP_) #ifdef CONFIG_PROVE_LOCKING # define might_lock(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, 0, 0, 0, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, 0, _THIS_IP_); \ } while (0) # define might_lock_read(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, 0, _THIS_IP_); \ } while (0) #define lockdep_assert_irqs_enabled() do { \ WARN_ONCE(debug_locks && !current->lockdep_recursion && \ !current->hardirqs_enabled, \ "IRQs not enabled as expected\n"); \ } while (0) #define lockdep_assert_irqs_disabled() do { \ WARN_ONCE(debug_locks && !current->lockdep_recursion && \ current->hardirqs_enabled, \ "IRQs not disabled as expected\n"); \ } while (0) #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) # define lockdep_assert_irqs_enabled() do { } while (0) # define lockdep_assert_irqs_disabled() do { } while (0) #endif #ifdef CONFIG_LOCKDEP void lockdep_rcu_suspicious(const char *file, const int line, const char *s); #else static inline void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { } #endif #endif /* __LINUX_LOCKDEP_H */ |
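/*
 * Added example (illustrative sketch, not part of the original header): a
 * hypothetical driver object "struct foo_dev" that gives its spinlock its
 * own lockdep class and asserts lock ownership in an internal helper.
 * All foo_* names are made up for this sketch; only lockdep_set_class()
 * and lockdep_assert_held() from above are real interfaces.
 */
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/lockdep.h>

struct foo_dev {
	spinlock_t lock;
	u64 stats;
};

static struct lock_class_key foo_lock_key;

static void foo_dev_init(struct foo_dev *foo)
{
	spin_lock_init(&foo->lock);
	/* Separate class: lockdep reports name this lock specifically. */
	lockdep_set_class(&foo->lock, &foo_lock_key);
}

/* Caller must hold foo->lock; with CONFIG_LOCKDEP=y this is verified. */
static void foo_update_stats(struct foo_dev *foo)
{
	lockdep_assert_held(&foo->lock);
	foo->stats++;
}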
37 35 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 | #ifndef _LINUX_TIME32_H #define _LINUX_TIME32_H /* * These are all interfaces based on the old time_t definition * that overflows in 2038 on 32-bit architectures. New code * should use the replacements based on time64_t and timespec64. * * Any interfaces in here that become unused as we migrate * code to time64_t should get removed. */ #include <linux/time64.h> #define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1) #if __BITS_PER_LONG == 64 /* timespec64 is defined as timespec here */ static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) { return *(const struct timespec *)&ts64; } static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) { return *(const struct timespec64 *)&ts; } #else static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) { struct timespec ret; ret.tv_sec = (time_t)ts64.tv_sec; ret.tv_nsec = ts64.tv_nsec; return ret; } static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) { struct timespec64 ret; ret.tv_sec = ts.tv_sec; ret.tv_nsec = ts.tv_nsec; return ret; } #endif static inline int timespec_equal(const struct timespec *a, const struct timespec *b) { return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); } /* * lhs < rhs: return <0 * lhs == rhs: return 0 * lhs > rhs: return >0 */ static inline int timespec_compare(const struct timespec *lhs, const struct timespec *rhs) { if (lhs->tv_sec < rhs->tv_sec) return -1; if (lhs->tv_sec > rhs->tv_sec) return 1; return lhs->tv_nsec - rhs->tv_nsec; } extern void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec); static inline struct timespec timespec_add(struct timespec lhs, struct timespec rhs) { struct timespec ts_delta; set_normalized_timespec(&ts_delta, lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); return ts_delta; } /* * sub = lhs - rhs, in normalized form */ static inline struct timespec timespec_sub(struct timespec lhs, struct timespec rhs) { struct timespec ts_delta; set_normalized_timespec(&ts_delta, lhs.tv_sec - rhs.tv_sec, lhs.tv_nsec - rhs.tv_nsec); return ts_delta; } /* * Returns true if the timespec is norm, false if denorm: */ static inline bool timespec_valid(const struct timespec *ts) { /* Dates before 1970 are bogus */ if (ts->tv_sec < 0) return false; /* Can't have more nanoseconds then a second */ if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return false; return true; } static inline bool timespec_valid_strict(const struct timespec *ts) { if (!timespec_valid(ts)) return false; /* Disallow values that could overflow ktime_t */ if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) return false; return true; } /** * timespec_to_ns - Convert timespec to nanoseconds * @ts: 
pointer to the timespec variable to be converted * * Returns the scalar nanosecond representation of the timespec * parameter. */ static inline s64 timespec_to_ns(const struct timespec *ts) { return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; } /** * ns_to_timespec - Convert nanoseconds to timespec * @nsec: the nanoseconds value to be converted * * Returns the timespec representation of the nsec parameter. */ extern struct timespec ns_to_timespec(const s64 nsec); /** * timespec_add_ns - Adds nanoseconds to a timespec * @a: pointer to timespec to be incremented * @ns: unsigned nanoseconds value to be added * * This must always be inlined because its used from the x86-64 vdso, * which cannot call other kernel functions. */ static __always_inline void timespec_add_ns(struct timespec *a, u64 ns) { a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); a->tv_nsec = ns; } /** * time_to_tm - converts the calendar time to local broken-down time * * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, * Coordinated Universal Time (UTC). * @offset offset seconds adding to totalsecs. * @result pointer to struct tm variable to receive broken-down time */ static inline void time_to_tm(time_t totalsecs, int offset, struct tm *result) { time64_to_tm(totalsecs, offset, result); } static inline unsigned long mktime(const unsigned int year, const unsigned int mon, const unsigned int day, const unsigned int hour, const unsigned int min, const unsigned int sec) { return mktime64(year, mon, day, hour, min, sec); } static inline bool timeval_valid(const struct timeval *tv) { /* Dates before 1970 are bogus */ if (tv->tv_sec < 0) return false; /* Can't have more microseconds then a second */ if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) return false; return true; } extern struct timespec timespec_trunc(struct timespec t, unsigned int gran); /** * timeval_to_ns - Convert timeval to nanoseconds * @ts: pointer to the timeval variable to be converted * * Returns the scalar nanosecond representation of the timeval * parameter. */ static inline s64 timeval_to_ns(const struct timeval *tv) { return ((s64) tv->tv_sec * NSEC_PER_SEC) + tv->tv_usec * NSEC_PER_USEC; } /** * ns_to_timeval - Convert nanoseconds to timeval * @nsec: the nanoseconds value to be converted * * Returns the timeval representation of the nsec parameter. */ extern struct timeval ns_to_timeval(const s64 nsec); extern struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec); /* * New aliases for compat time functions. These will be used to replace * the compat code so it can be shared between 32-bit and 64-bit builds * both of which provide compatibility with old 32-bit tasks. */ #define old_time32_t compat_time_t #define old_timeval32 compat_timeval #define old_timespec32 compat_timespec #define old_itimerspec32 compat_itimerspec #define ns_to_old_timeval32 ns_to_compat_timeval #define get_old_itimerspec32 get_compat_itimerspec64 #define put_old_itimerspec32 put_compat_itimerspec64 #define get_old_timespec32 compat_get_timespec64 #define put_old_timespec32 compat_put_timespec64 #endif |
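/*
 * Added example (sketch only, not part of the original header): computing
 * an elapsed interval with the legacy timespec helpers declared above and
 * widening a timespec to timespec64.  example_elapsed_ns() and
 * example_to_ts64() are hypothetical names; new kernel code should use
 * timespec64 directly instead of these 2038-unsafe types.
 */
#include <linux/errno.h>
#include <linux/time32.h>

static s64 example_elapsed_ns(struct timespec start, struct timespec end)
{
	struct timespec delta;

	if (!timespec_valid(&start) || !timespec_valid(&end))
		return -EINVAL;

	delta = timespec_sub(end, start);	/* normalized end - start */
	return timespec_to_ns(&delta);
}

static struct timespec64 example_to_ts64(struct timespec ts)
{
	/* Plain copy on 64-bit; widens tv_sec on 32-bit architectures. */
	return timespec_to_timespec64(ts);
}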
/* * scsi.c Copyright (C) 1992 Drew Eckhardt * Copyright (C) 1993, 1994, 1995, 1999 Eric Youngdale * Copyright (C) 2002, 2003 Christoph Hellwig * * generic mid-level SCSI driver * Initial versions: Drew Eckhardt *
Subsequent revisions: Eric Youngdale * * <drew@colorado.edu> * * Bug correction thanks go to : * Rik Faith <faith@cs.unc.edu> * Tommy Thorn <tthorn> * Thomas Wuensche <tw@fgb1.fgb.mw.tu-muenchen.de> * * Modified by Eric Youngdale eric@andante.org or ericy@gnu.ai.mit.edu to * add scatter-gather, multiple outstanding request, and other * enhancements. * * Native multichannel, wide scsi, /proc/scsi and hot plugging * support added by Michael Neuffer <mike@i-connect.net> * * Added request_module("scsi_hostadapter") for kerneld: * (Put an "alias scsi_hostadapter your_hostadapter" in /etc/modprobe.conf) * Bjorn Ekwall <bj0rn@blox.se> * (changed to kmod) * * Major improvements to the timeout, abort, and reset processing, * as well as performance modifications for large queue depths by * Leonard N. Zubkoff <lnz@dandelion.com> * * Converted cli() code to spinlocks, Ingo Molnar * * Jiffies wrap fixes (host->resetting), 3 Dec 1998 Andrea Arcangeli * * out_of_space hacks, D. Gilbert (dpg) 990608 */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/delay.h> #include <linux/init.h> #include <linux/completion.h> #include <linux/unistd.h> #include <linux/spinlock.h> #include <linux/kmod.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/async.h> #include <asm/unaligned.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_device.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_host.h> #include <scsi/scsi_tcq.h> #include "scsi_priv.h" #include "scsi_logging.h" #define CREATE_TRACE_POINTS #include <trace/events/scsi.h> /* * Definitions and constants. */ /* * Note - the initial logging level can be set here to log events at boot time. * After the system is up, you may enable logging via the /proc interface. */ unsigned int scsi_logging_level; #if defined(CONFIG_SCSI_LOGGING) EXPORT_SYMBOL(scsi_logging_level); #endif /* sd, scsi core and power management need to coordinate flushing async actions */ ASYNC_DOMAIN(scsi_sd_probe_domain); EXPORT_SYMBOL(scsi_sd_probe_domain); /* * Separate domain (from scsi_sd_probe_domain) to maximize the benefit of * asynchronous system resume operations. It is marked 'exclusive' to avoid * being included in the async_synchronize_full() that is invoked by * dpm_resume() */ ASYNC_DOMAIN_EXCLUSIVE(scsi_sd_pm_domain); EXPORT_SYMBOL(scsi_sd_pm_domain); /** * scsi_put_command - Free a scsi command block * @cmd: command block to free * * Returns: Nothing. * * Notes: The command must not belong to any lists. 
*/ void scsi_put_command(struct scsi_cmnd *cmd) { scsi_del_cmd_from_list(cmd); BUG_ON(delayed_work_pending(&cmd->abort_work)); } #ifdef CONFIG_SCSI_LOGGING void scsi_log_send(struct scsi_cmnd *cmd) { unsigned int level; /* * If ML QUEUE log level is greater than or equal to: * * 1: nothing (match completion) * * 2: log opcode + command of all commands + cmd address * * 3: same as 2 * * 4: same as 3 */ if (unlikely(scsi_logging_level)) { level = SCSI_LOG_LEVEL(SCSI_LOG_MLQUEUE_SHIFT, SCSI_LOG_MLQUEUE_BITS); if (level > 1) { scmd_printk(KERN_INFO, cmd, "Send: scmd 0x%p\n", cmd); scsi_print_command(cmd); } } } void scsi_log_completion(struct scsi_cmnd *cmd, int disposition) { unsigned int level; /* * If ML COMPLETE log level is greater than or equal to: * * 1: log disposition, result, opcode + command, and conditionally * sense data for failures or non SUCCESS dispositions. * * 2: same as 1 but for all command completions. * * 3: same as 2 * * 4: same as 3 plus dump extra junk */ if (unlikely(scsi_logging_level)) { level = SCSI_LOG_LEVEL(SCSI_LOG_MLCOMPLETE_SHIFT, SCSI_LOG_MLCOMPLETE_BITS); if (((level > 0) && (cmd->result || disposition != SUCCESS)) || (level > 1)) { scsi_print_result(cmd, "Done", disposition); scsi_print_command(cmd); if (status_byte(cmd->result) == CHECK_CONDITION) scsi_print_sense(cmd); if (level > 3) scmd_printk(KERN_INFO, cmd, "scsi host busy %d failed %d\n", scsi_host_busy(cmd->device->host), cmd->device->host->host_failed); } } } #endif /** * scsi_cmd_get_serial - Assign a serial number to a command * @host: the scsi host * @cmd: command to assign serial number to * * Description: a serial number identifies a request for error recovery * and debugging purposes. Protected by the Host_Lock of host. */ void scsi_cmd_get_serial(struct Scsi_Host *host, struct scsi_cmnd *cmd) { cmd->serial_number = host->cmd_serial_number++; if (cmd->serial_number == 0) cmd->serial_number = host->cmd_serial_number++; } EXPORT_SYMBOL(scsi_cmd_get_serial); /** * scsi_finish_command - cleanup and pass command back to upper layer * @cmd: the command * * Description: Pass command off to upper layer for finishing of I/O * request, waking processes that are waiting on results, * etc. */ void scsi_finish_command(struct scsi_cmnd *cmd) { struct scsi_device *sdev = cmd->device; struct scsi_target *starget = scsi_target(sdev); struct Scsi_Host *shost = sdev->host; struct scsi_driver *drv; unsigned int good_bytes; scsi_device_unbusy(sdev); /* * Clear the flags that say that the device/target/host is no longer * capable of accepting new commands. */ if (atomic_read(&shost->host_blocked)) atomic_set(&shost->host_blocked, 0); if (atomic_read(&starget->target_blocked)) atomic_set(&starget->target_blocked, 0); if (atomic_read(&sdev->device_blocked)) atomic_set(&sdev->device_blocked, 0); /* * If we have valid sense information, then some kind of recovery * must have taken place. Make a note of this. */ if (SCSI_SENSE_VALID(cmd)) cmd->result |= (DRIVER_SENSE << 24); SCSI_LOG_MLCOMPLETE(4, sdev_printk(KERN_INFO, sdev, "Notifying upper driver of completion " "(result %x)\n", cmd->result)); good_bytes = scsi_bufflen(cmd); if (!blk_rq_is_passthrough(cmd->request)) { int old_good_bytes = good_bytes; drv = scsi_cmd_to_driver(cmd); if (drv->done) good_bytes = drv->done(cmd); /* * USB may not give sense identifying bad sector and * simply return a residue instead, so subtract off the * residue if drv->done() error processing indicates no * change to the completion length. 
*/ if (good_bytes == old_good_bytes) good_bytes -= scsi_get_resid(cmd); } scsi_io_completion(cmd, good_bytes); } /** * scsi_change_queue_depth - change a device's queue depth * @sdev: SCSI Device in question * @depth: number of commands allowed to be queued to the driver * * Sets the device queue depth and returns the new value. */ int scsi_change_queue_depth(struct scsi_device *sdev, int depth) { if (depth > 0) { sdev->queue_depth = depth; wmb(); } if (sdev->request_queue) blk_set_queue_depth(sdev->request_queue, depth); return sdev->queue_depth; } EXPORT_SYMBOL(scsi_change_queue_depth); /** * scsi_track_queue_full - track QUEUE_FULL events to adjust queue depth * @sdev: SCSI Device in question * @depth: Current number of outstanding SCSI commands on this device, * not counting the one returned as QUEUE_FULL. * * Description: This function will track successive QUEUE_FULL events on a * specific SCSI device to determine if and when there is a * need to adjust the queue depth on the device. * * Returns: 0 - No change needed, >0 - Adjust queue depth to this new depth, * -1 - Drop back to untagged operation using host->cmd_per_lun * as the untagged command depth * * Lock Status: None held on entry * * Notes: Low level drivers may call this at any time and we will do * "The Right Thing." We are interrupt context safe. */ int scsi_track_queue_full(struct scsi_device *sdev, int depth) { /* * Don't let QUEUE_FULLs on the same * jiffies count, they could all be from * same event. */ if ((jiffies >> 4) == (sdev->last_queue_full_time >> 4)) return 0; sdev->last_queue_full_time = jiffies; if (sdev->last_queue_full_depth != depth) { sdev->last_queue_full_count = 1; sdev->last_queue_full_depth = depth; } else { sdev->last_queue_full_count++; } if (sdev->last_queue_full_count <= 10) return 0; return scsi_change_queue_depth(sdev, depth); } EXPORT_SYMBOL(scsi_track_queue_full); /** * scsi_vpd_inquiry - Request a device provide us with a VPD page * @sdev: The device to ask * @buffer: Where to put the result * @page: Which Vital Product Data to return * @len: The length of the buffer * * This is an internal helper function. You probably want to use * scsi_get_vpd_page instead. * * Returns size of the vpd page on success or a negative error number. */ static int scsi_vpd_inquiry(struct scsi_device *sdev, unsigned char *buffer, u8 page, unsigned len) { int result; unsigned char cmd[16]; if (len < 4) return -EINVAL; cmd[0] = INQUIRY; cmd[1] = 1; /* EVPD */ cmd[2] = page; cmd[3] = len >> 8; cmd[4] = len & 0xff; cmd[5] = 0; /* Control byte */ /* * I'm not convinced we need to try quite this hard to get VPD, but * all the existing users tried this hard. */ result = scsi_execute_req(sdev, cmd, DMA_FROM_DEVICE, buffer, len, NULL, 30 * HZ, 3, NULL); if (result) return -EIO; /* Sanity check that we got the page back that we asked for */ if (buffer[1] != page) return -EIO; return get_unaligned_be16(&buffer[2]) + 4; } /** * scsi_get_vpd_page - Get Vital Product Data from a SCSI device * @sdev: The device to ask * @page: Which Vital Product Data to return * @buf: where to store the VPD * @buf_len: number of bytes in the VPD buffer area * * SCSI devices may optionally supply Vital Product Data. Each 'page' * of VPD is defined in the appropriate SCSI document (eg SPC, SBC). * If the device supports this VPD page, this routine returns a pointer * to a buffer containing the data from that page. The caller is * responsible for calling kfree() on this pointer when it is no longer * needed. 
If we cannot retrieve the VPD page this routine returns %NULL. */ int scsi_get_vpd_page(struct scsi_device *sdev, u8 page, unsigned char *buf, int buf_len) { int i, result; if (sdev->skip_vpd_pages) goto fail; /* Ask for all the pages supported by this device */ result = scsi_vpd_inquiry(sdev, buf, 0, buf_len); if (result < 4) goto fail; /* If the user actually wanted this page, we can skip the rest */ if (page == 0) return 0; for (i = 4; i < min(result, buf_len); i++) if (buf[i] == page) goto found; if (i < result && i >= buf_len) /* ran off the end of the buffer, give us benefit of doubt */ goto found; /* The device claims it doesn't support the requested page */ goto fail; found: result = scsi_vpd_inquiry(sdev, buf, page, buf_len); if (result < 0) goto fail; return 0; fail: return -EINVAL; } EXPORT_SYMBOL_GPL(scsi_get_vpd_page); /** * scsi_get_vpd_buf - Get Vital Product Data from a SCSI device * @sdev: The device to ask * @page: Which Vital Product Data to return * * Returns %NULL upon failure. */ static struct scsi_vpd *scsi_get_vpd_buf(struct scsi_device *sdev, u8 page) { struct scsi_vpd *vpd_buf; int vpd_len = SCSI_VPD_PG_LEN, result; retry_pg: vpd_buf = kmalloc(sizeof(*vpd_buf) + vpd_len, GFP_KERNEL); if (!vpd_buf) return NULL; result = scsi_vpd_inquiry(sdev, vpd_buf->data, page, vpd_len); if (result < 0) { kfree(vpd_buf); return NULL; } if (result > vpd_len) { vpd_len = result; kfree(vpd_buf); goto retry_pg; } vpd_buf->len = result; return vpd_buf; } static void scsi_update_vpd_page(struct scsi_device *sdev, u8 page, struct scsi_vpd __rcu **sdev_vpd_buf) { struct scsi_vpd *vpd_buf; vpd_buf = scsi_get_vpd_buf(sdev, page); if (!vpd_buf) return; mutex_lock(&sdev->inquiry_mutex); rcu_swap_protected(*sdev_vpd_buf, vpd_buf, lockdep_is_held(&sdev->inquiry_mutex)); mutex_unlock(&sdev->inquiry_mutex); if (vpd_buf) kfree_rcu(vpd_buf, rcu); } /** * scsi_attach_vpd - Attach Vital Product Data to a SCSI device structure * @sdev: The device to ask * * Attach the 'Device Identification' VPD page (0x83) and the * 'Unit Serial Number' VPD page (0x80) to a SCSI device * structure. This information can be used to identify the device * uniquely. */ void scsi_attach_vpd(struct scsi_device *sdev) { int i; struct scsi_vpd *vpd_buf; if (!scsi_device_supports_vpd(sdev)) return; /* Ask for all the pages supported by this device */ vpd_buf = scsi_get_vpd_buf(sdev, 0); if (!vpd_buf) return; for (i = 4; i < vpd_buf->len; i++) { if (vpd_buf->data[i] == 0x80) scsi_update_vpd_page(sdev, 0x80, &sdev->vpd_pg80); if (vpd_buf->data[i] == 0x83) scsi_update_vpd_page(sdev, 0x83, &sdev->vpd_pg83); } kfree(vpd_buf); } /** * scsi_report_opcode - Find out if a given command opcode is supported * @sdev: scsi device to query * @buffer: scratch buffer (must be at least 20 bytes long) * @len: length of buffer * @opcode: opcode for command to look up * * Uses the REPORT SUPPORTED OPERATION CODES to look up the given * opcode. Returns -EINVAL if RSOC fails, 0 if the command opcode is * unsupported and 1 if the device claims to support the command. 
*/ int scsi_report_opcode(struct scsi_device *sdev, unsigned char *buffer, unsigned int len, unsigned char opcode) { unsigned char cmd[16]; struct scsi_sense_hdr sshdr; int result; if (sdev->no_report_opcodes || sdev->scsi_level < SCSI_SPC_3) return -EINVAL; memset(cmd, 0, 16); cmd[0] = MAINTENANCE_IN; cmd[1] = MI_REPORT_SUPPORTED_OPERATION_CODES; cmd[2] = 1; /* One command format */ cmd[3] = opcode; put_unaligned_be32(len, &cmd[6]); memset(buffer, 0, len); result = scsi_execute_req(sdev, cmd, DMA_FROM_DEVICE, buffer, len, &sshdr, 30 * HZ, 3, NULL); if (result && scsi_sense_valid(&sshdr) && sshdr.sense_key == ILLEGAL_REQUEST && (sshdr.asc == 0x20 || sshdr.asc == 0x24) && sshdr.ascq == 0x00) return -EINVAL; if ((buffer[1] & 3) == 3) /* Command supported */ return 1; return 0; } EXPORT_SYMBOL(scsi_report_opcode); /** * scsi_device_get - get an additional reference to a scsi_device * @sdev: device to get a reference to * * Description: Gets a reference to the scsi_device and increments the use count * of the underlying LLDD module. You must hold host_lock of the * parent Scsi_Host or already have a reference when calling this. * * This will fail if a device is deleted or cancelled, or when the LLD module * is in the process of being unloaded. */ int scsi_device_get(struct scsi_device *sdev) { if (sdev->sdev_state == SDEV_DEL || sdev->sdev_state == SDEV_CANCEL) goto fail; if (!get_device(&sdev->sdev_gendev)) goto fail; if (!try_module_get(sdev->host->hostt->module)) goto fail_put_device; return 0; fail_put_device: put_device(&sdev->sdev_gendev); fail: return -ENXIO; } EXPORT_SYMBOL(scsi_device_get); /** * scsi_device_put - release a reference to a scsi_device * @sdev: device to release a reference on. * * Description: Release a reference to the scsi_device and decrements the use * count of the underlying LLDD module. The device is freed once the last * user vanishes. */ void scsi_device_put(struct scsi_device *sdev) { struct module *mod = sdev->host->hostt->module; put_device(&sdev->sdev_gendev); module_put(mod); } EXPORT_SYMBOL(scsi_device_put); /* helper for shost_for_each_device, see that for documentation */ struct scsi_device *__scsi_iterate_devices(struct Scsi_Host *shost, struct scsi_device *prev) { struct list_head *list = (prev ? &prev->siblings : &shost->__devices); struct scsi_device *next = NULL; unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); while (list->next != &shost->__devices) { next = list_entry(list->next, struct scsi_device, siblings); /* skip devices that we can't get a reference to */ if (!scsi_device_get(next)) break; next = NULL; list = list->next; } spin_unlock_irqrestore(shost->host_lock, flags); if (prev) scsi_device_put(prev); return next; } EXPORT_SYMBOL(__scsi_iterate_devices); /** * starget_for_each_device - helper to walk all devices of a target * @starget: target whose devices we want to iterate over. * @data: Opaque passed to each function call. * @fn: Function to call on each device * * This traverses over each device of @starget. The devices have * a reference that must be released by scsi_host_put when breaking * out of the loop. 
*/ void starget_for_each_device(struct scsi_target *starget, void *data, void (*fn)(struct scsi_device *, void *)) { struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); struct scsi_device *sdev; shost_for_each_device(sdev, shost) { if ((sdev->channel == starget->channel) && (sdev->id == starget->id)) fn(sdev, data); } } EXPORT_SYMBOL(starget_for_each_device); /** * __starget_for_each_device - helper to walk all devices of a target (UNLOCKED) * @starget: target whose devices we want to iterate over. * @data: parameter for callback @fn() * @fn: callback function that is invoked for each device * * This traverses over each device of @starget. It does _not_ * take a reference on the scsi_device, so the whole loop must be * protected by shost->host_lock. * * Note: The only reason why drivers would want to use this is because * they need to access the device list in irq context. Otherwise you * really want to use starget_for_each_device instead. **/ void __starget_for_each_device(struct scsi_target *starget, void *data, void (*fn)(struct scsi_device *, void *)) { struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); struct scsi_device *sdev; __shost_for_each_device(sdev, shost) { if ((sdev->channel == starget->channel) && (sdev->id == starget->id)) fn(sdev, data); } } EXPORT_SYMBOL(__starget_for_each_device); /** * __scsi_device_lookup_by_target - find a device given the target (UNLOCKED) * @starget: SCSI target pointer * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @lun for a given * @starget. The returned scsi_device does not have an additional * reference. You must hold the host's host_lock over this call and * any access to the returned scsi_device. A scsi_device in state * SDEV_DEL is skipped. * * Note: The only reason why drivers should use this is because * they need to access the device list in irq context. Otherwise you * really want to use scsi_device_lookup_by_target instead. **/ struct scsi_device *__scsi_device_lookup_by_target(struct scsi_target *starget, u64 lun) { struct scsi_device *sdev; list_for_each_entry(sdev, &starget->devices, same_target_siblings) { if (sdev->sdev_state == SDEV_DEL) continue; if (sdev->lun ==lun) return sdev; } return NULL; } EXPORT_SYMBOL(__scsi_device_lookup_by_target); /** * scsi_device_lookup_by_target - find a device given the target * @starget: SCSI target pointer * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @lun for a given * @starget. The returned scsi_device has an additional reference that * needs to be released with scsi_device_put once you're done with it. **/ struct scsi_device *scsi_device_lookup_by_target(struct scsi_target *starget, u64 lun) { struct scsi_device *sdev; struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); sdev = __scsi_device_lookup_by_target(starget, lun); if (sdev && scsi_device_get(sdev)) sdev = NULL; spin_unlock_irqrestore(shost->host_lock, flags); return sdev; } EXPORT_SYMBOL(scsi_device_lookup_by_target); /** * __scsi_device_lookup - find a device given the host (UNLOCKED) * @shost: SCSI host pointer * @channel: SCSI channel (zero if only one channel) * @id: SCSI target number (physical unit number) * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @channel, @id, @lun * for a given host. The returned scsi_device does not have an additional * reference. 
You must hold the host's host_lock over this call and any access * to the returned scsi_device. * * Note: The only reason why drivers would want to use this is because * they need to access the device list in irq context. Otherwise you * really want to use scsi_device_lookup instead. **/ struct scsi_device *__scsi_device_lookup(struct Scsi_Host *shost, uint channel, uint id, u64 lun) { struct scsi_device *sdev; list_for_each_entry(sdev, &shost->__devices, siblings) { if (sdev->sdev_state == SDEV_DEL) continue; if (sdev->channel == channel && sdev->id == id && sdev->lun ==lun) return sdev; } return NULL; } EXPORT_SYMBOL(__scsi_device_lookup); /** * scsi_device_lookup - find a device given the host * @shost: SCSI host pointer * @channel: SCSI channel (zero if only one channel) * @id: SCSI target number (physical unit number) * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @channel, @id, @lun * for a given host. The returned scsi_device has an additional reference that * needs to be released with scsi_device_put once you're done with it. **/ struct scsi_device *scsi_device_lookup(struct Scsi_Host *shost, uint channel, uint id, u64 lun) { struct scsi_device *sdev; unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); sdev = __scsi_device_lookup(shost, channel, id, lun); if (sdev && scsi_device_get(sdev)) sdev = NULL; spin_unlock_irqrestore(shost->host_lock, flags); return sdev; } EXPORT_SYMBOL(scsi_device_lookup); MODULE_DESCRIPTION("SCSI core"); MODULE_LICENSE("GPL"); module_param(scsi_logging_level, int, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(scsi_logging_level, "a bit mask of logging levels"); #ifdef CONFIG_SCSI_MQ_DEFAULT bool scsi_use_blk_mq = true; #else bool scsi_use_blk_mq = false; #endif module_param_named(use_blk_mq, scsi_use_blk_mq, bool, S_IWUSR | S_IRUGO); static int __init init_scsi(void) { int error; error = scsi_init_queue(); if (error) return error; error = scsi_init_procfs(); if (error) goto cleanup_queue; error = scsi_init_devinfo(); if (error) goto cleanup_procfs; error = scsi_init_hosts(); if (error) goto cleanup_devlist; error = scsi_init_sysctl(); if (error) goto cleanup_hosts; error = scsi_sysfs_register(); if (error) goto cleanup_sysctl; scsi_netlink_init(); printk(KERN_NOTICE "SCSI subsystem initialized\n"); return 0; cleanup_sysctl: scsi_exit_sysctl(); cleanup_hosts: scsi_exit_hosts(); cleanup_devlist: scsi_exit_devinfo(); cleanup_procfs: scsi_exit_procfs(); cleanup_queue: scsi_exit_queue(); printk(KERN_ERR "SCSI subsystem failed to initialize, error = %d\n", -error); return error; } static void __exit exit_scsi(void) { scsi_netlink_exit(); scsi_sysfs_unregister(); scsi_exit_sysctl(); scsi_exit_hosts(); scsi_exit_devinfo(); scsi_exit_procfs(); scsi_exit_queue(); async_unregister_domain(&scsi_sd_probe_domain); } subsys_initcall(init_scsi); module_exit(exit_scsi); |
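/*
 * Added example (illustrative sketch, not part of the original file): a
 * caller looking up a device by host/channel/id/lun and fetching the Unit
 * Serial Number VPD page (0x80) with the helpers above.  The function
 * name, the 255-byte buffer size and the error handling are assumptions
 * made only for this sketch.
 */
#include <linux/errno.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_host.h>

static int example_read_serial_vpd(struct Scsi_Host *shost, uint channel,
				   uint id, u64 lun)
{
	struct scsi_device *sdev;
	unsigned char buf[255];
	int ret;

	/* scsi_device_lookup() takes a reference; scsi_device_put() drops it. */
	sdev = scsi_device_lookup(shost, channel, id, lun);
	if (!sdev)
		return -ENODEV;

	/* Unit Serial Number page: 0 on success, -EINVAL on failure. */
	ret = scsi_get_vpd_page(sdev, 0x80, buf, sizeof(buf));

	scsi_device_put(sdev);
	return ret;
}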
/* * linux/fs/sysv/inode.c * * minix/inode.c * Copyright (C) 1991, 1992 Linus Torvalds * * xenix/inode.c * Copyright (C) 1992 Doug Evans * * coh/inode.c * Copyright (C) 1993 Pascal Haible, Bruno Haible * * sysv/inode.c * Copyright (C) 1993 Paul B. Monday * * sysv/inode.c * Copyright (C) 1993 Bruno Haible * Copyright (C) 1997, 1998 Krzysztof G. Baranowski * * This file contains code for reading/parsing the superblock. */ #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/buffer_head.h> #include "sysv.h" /* * The following functions try to recognize specific filesystems. * * We recognize: * - Xenix FS by its magic number. * - SystemV FS by its magic number. * - Coherent FS by its funny fname/fpack field. * - SCO AFS by s_nfree == 0xffff * - V7 FS has no distinguishing features. * * We discriminate between SystemV4 and SystemV2 FS by the assumption that * the time stamp is not < 01-01-1980.
*/ enum { JAN_1_1980 = (10*365 + 2) * 24 * 60 * 60 }; static void detected_xenix(struct sysv_sb_info *sbi, unsigned *max_links) { struct buffer_head *bh1 = sbi->s_bh1; struct buffer_head *bh2 = sbi->s_bh2; struct xenix_super_block * sbd1; struct xenix_super_block * sbd2; if (bh1 != bh2) sbd1 = sbd2 = (struct xenix_super_block *) bh1->b_data; else { /* block size = 512, so bh1 != bh2 */ sbd1 = (struct xenix_super_block *) bh1->b_data; sbd2 = (struct xenix_super_block *) (bh2->b_data - 512); } *max_links = XENIX_LINK_MAX; sbi->s_fic_size = XENIX_NICINOD; sbi->s_flc_size = XENIX_NICFREE; sbi->s_sbd1 = (char *)sbd1; sbi->s_sbd2 = (char *)sbd2; sbi->s_sb_fic_count = &sbd1->s_ninode; sbi->s_sb_fic_inodes = &sbd1->s_inode[0]; sbi->s_sb_total_free_inodes = &sbd2->s_tinode; sbi->s_bcache_count = &sbd1->s_nfree; sbi->s_bcache = &sbd1->s_free[0]; sbi->s_free_blocks = &sbd2->s_tfree; sbi->s_sb_time = &sbd2->s_time; sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd1->s_isize); sbi->s_nzones = fs32_to_cpu(sbi, sbd1->s_fsize); } static void detected_sysv4(struct sysv_sb_info *sbi, unsigned *max_links) { struct sysv4_super_block * sbd; struct buffer_head *bh1 = sbi->s_bh1; struct buffer_head *bh2 = sbi->s_bh2; if (bh1 == bh2) sbd = (struct sysv4_super_block *) (bh1->b_data + BLOCK_SIZE/2); else sbd = (struct sysv4_super_block *) bh2->b_data; *max_links = SYSV_LINK_MAX; sbi->s_fic_size = SYSV_NICINOD; sbi->s_flc_size = SYSV_NICFREE; sbi->s_sbd1 = (char *)sbd; sbi->s_sbd2 = (char *)sbd; sbi->s_sb_fic_count = &sbd->s_ninode; sbi->s_sb_fic_inodes = &sbd->s_inode[0]; sbi->s_sb_total_free_inodes = &sbd->s_tinode; sbi->s_bcache_count = &sbd->s_nfree; sbi->s_bcache = &sbd->s_free[0]; sbi->s_free_blocks = &sbd->s_tfree; sbi->s_sb_time = &sbd->s_time; sbi->s_sb_state = &sbd->s_state; sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); } static void detected_sysv2(struct sysv_sb_info *sbi, unsigned *max_links) { struct sysv2_super_block *sbd; struct buffer_head *bh1 = sbi->s_bh1; struct buffer_head *bh2 = sbi->s_bh2; if (bh1 == bh2) sbd = (struct sysv2_super_block *) (bh1->b_data + BLOCK_SIZE/2); else sbd = (struct sysv2_super_block *) bh2->b_data; *max_links = SYSV_LINK_MAX; sbi->s_fic_size = SYSV_NICINOD; sbi->s_flc_size = SYSV_NICFREE; sbi->s_sbd1 = (char *)sbd; sbi->s_sbd2 = (char *)sbd; sbi->s_sb_fic_count = &sbd->s_ninode; sbi->s_sb_fic_inodes = &sbd->s_inode[0]; sbi->s_sb_total_free_inodes = &sbd->s_tinode; sbi->s_bcache_count = &sbd->s_nfree; sbi->s_bcache = &sbd->s_free[0]; sbi->s_free_blocks = &sbd->s_tfree; sbi->s_sb_time = &sbd->s_time; sbi->s_sb_state = &sbd->s_state; sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); } static void detected_coherent(struct sysv_sb_info *sbi, unsigned *max_links) { struct coh_super_block * sbd; struct buffer_head *bh1 = sbi->s_bh1; sbd = (struct coh_super_block *) bh1->b_data; *max_links = COH_LINK_MAX; sbi->s_fic_size = COH_NICINOD; sbi->s_flc_size = COH_NICFREE; sbi->s_sbd1 = (char *)sbd; sbi->s_sbd2 = (char *)sbd; sbi->s_sb_fic_count = &sbd->s_ninode; sbi->s_sb_fic_inodes = &sbd->s_inode[0]; sbi->s_sb_total_free_inodes = &sbd->s_tinode; sbi->s_bcache_count = &sbd->s_nfree; sbi->s_bcache = &sbd->s_free[0]; sbi->s_free_blocks = &sbd->s_tfree; sbi->s_sb_time = &sbd->s_time; sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); } static void detected_v7(struct sysv_sb_info *sbi, unsigned *max_links) { struct buffer_head *bh2 = 
sbi->s_bh2; struct v7_super_block *sbd = (struct v7_super_block *)bh2->b_data; *max_links = V7_LINK_MAX; sbi->s_fic_size = V7_NICINOD; sbi->s_flc_size = V7_NICFREE; sbi->s_sbd1 = (char *)sbd; sbi->s_sbd2 = (char *)sbd; sbi->s_sb_fic_count = &sbd->s_ninode; sbi->s_sb_fic_inodes = &sbd->s_inode[0]; sbi->s_sb_total_free_inodes = &sbd->s_tinode; sbi->s_bcache_count = &sbd->s_nfree; sbi->s_bcache = &sbd->s_free[0]; sbi->s_free_blocks = &sbd->s_tfree; sbi->s_sb_time = &sbd->s_time; sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); } static int detect_xenix(struct sysv_sb_info *sbi, struct buffer_head *bh) { struct xenix_super_block *sbd = (struct xenix_super_block *)bh->b_data; if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0x2b5544)) sbi->s_bytesex = BYTESEX_LE; else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0x2b5544)) sbi->s_bytesex = BYTESEX_BE; else return 0; switch (fs32_to_cpu(sbi, sbd->s_type)) { case 1: sbi->s_type = FSTYPE_XENIX; return 1; case 2: sbi->s_type = FSTYPE_XENIX; return 2; default: return 0; } } static int detect_sysv(struct sysv_sb_info *sbi, struct buffer_head *bh) { struct super_block *sb = sbi->s_sb; /* All relevant fields are at the same offsets in R2 and R4 */ struct sysv4_super_block * sbd; u32 type; sbd = (struct sysv4_super_block *) (bh->b_data + BLOCK_SIZE/2); if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0xfd187e20)) sbi->s_bytesex = BYTESEX_LE; else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0xfd187e20)) sbi->s_bytesex = BYTESEX_BE; else return 0; type = fs32_to_cpu(sbi, sbd->s_type); if (fs16_to_cpu(sbi, sbd->s_nfree) == 0xffff) { sbi->s_type = FSTYPE_AFS; sbi->s_forced_ro = 1; if (!sb_rdonly(sb)) { printk("SysV FS: SCO EAFS on %s detected, " "forcing read-only mode.\n", sb->s_id); } return type; } if (fs32_to_cpu(sbi, sbd->s_time) < JAN_1_1980) { /* this is likely to happen on SystemV2 FS */ if (type > 3 || type < 1) return 0; sbi->s_type = FSTYPE_SYSV2; return type; } if ((type > 3 || type < 1) && (type > 0x30 || type < 0x10)) return 0; /* On Interactive Unix (ISC) Version 4.0/3.x s_type field = 0x10, 0x20 or 0x30 indicates that symbolic links and the 14-character filename limit is gone. Due to lack of information about this feature read-only mode seems to be a reasonable approach... -KGB */ if (type >= 0x10) { printk("SysV FS: can't handle long file names on %s, " "forcing read-only mode.\n", sb->s_id); sbi->s_forced_ro = 1; } sbi->s_type = FSTYPE_SYSV4; return type >= 0x10 ? type >> 4 : type; } static int detect_coherent(struct sysv_sb_info *sbi, struct buffer_head *bh) { struct coh_super_block * sbd; sbd = (struct coh_super_block *) (bh->b_data + BLOCK_SIZE/2); if ((memcmp(sbd->s_fname,"noname",6) && memcmp(sbd->s_fname,"xxxxx ",6)) || (memcmp(sbd->s_fpack,"nopack",6) && memcmp(sbd->s_fpack,"xxxxx\n",6))) return 0; sbi->s_bytesex = BYTESEX_PDP; sbi->s_type = FSTYPE_COH; return 1; } static int detect_sysv_odd(struct sysv_sb_info *sbi, struct buffer_head *bh) { int size = detect_sysv(sbi, bh); return size>2 ? 
0 : size; } static struct { int block; int (*test)(struct sysv_sb_info *, struct buffer_head *); } flavours[] = { {1, detect_xenix}, {0, detect_sysv}, {0, detect_coherent}, {9, detect_sysv_odd}, {15,detect_sysv_odd}, {18,detect_sysv}, }; static char *flavour_names[] = { [FSTYPE_XENIX] = "Xenix", [FSTYPE_SYSV4] = "SystemV", [FSTYPE_SYSV2] = "SystemV Release 2", [FSTYPE_COH] = "Coherent", [FSTYPE_V7] = "V7", [FSTYPE_AFS] = "AFS", }; static void (*flavour_setup[])(struct sysv_sb_info *, unsigned *) = { [FSTYPE_XENIX] = detected_xenix, [FSTYPE_SYSV4] = detected_sysv4, [FSTYPE_SYSV2] = detected_sysv2, [FSTYPE_COH] = detected_coherent, [FSTYPE_V7] = detected_v7, [FSTYPE_AFS] = detected_sysv4, }; static int complete_read_super(struct super_block *sb, int silent, int size) { struct sysv_sb_info *sbi = SYSV_SB(sb); struct inode *root_inode; char *found = flavour_names[sbi->s_type]; u_char n_bits = size+8; int bsize = 1 << n_bits; int bsize_4 = bsize >> 2; sbi->s_firstinodezone = 2; flavour_setup[sbi->s_type](sbi, &sb->s_max_links); sbi->s_truncate = 1; sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone; sbi->s_inodes_per_block = bsize >> 6; sbi->s_inodes_per_block_1 = (bsize >> 6)-1; sbi->s_inodes_per_block_bits = n_bits-6; sbi->s_ind_per_block = bsize_4; sbi->s_ind_per_block_2 = bsize_4*bsize_4; sbi->s_toobig_block = 10 + bsize_4 * (1 + bsize_4 * (1 + bsize_4)); sbi->s_ind_per_block_bits = n_bits-2; sbi->s_ninodes = (sbi->s_firstdatazone - sbi->s_firstinodezone) << sbi->s_inodes_per_block_bits; if (!silent) printk("VFS: Found a %s FS (block size = %ld) on device %s\n", found, sb->s_blocksize, sb->s_id); sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type; /* set up enough so that it can read an inode */ sb->s_op = &sysv_sops; if (sbi->s_forced_ro) sb->s_flags |= SB_RDONLY; if (sbi->s_truncate) sb->s_d_op = &sysv_dentry_operations; root_inode = sysv_iget(sb, SYSV_ROOT_INO); if (IS_ERR(root_inode)) { printk("SysV FS: get root inode failed\n"); return 0; } sb->s_root = d_make_root(root_inode); if (!sb->s_root) { printk("SysV FS: get root dentry failed\n"); return 0; } return 1; } static int sysv_fill_super(struct super_block *sb, void *data, int silent) { struct buffer_head *bh1, *bh = NULL; struct sysv_sb_info *sbi; unsigned long blocknr; int size = 0, i; BUILD_BUG_ON(1024 != sizeof (struct xenix_super_block)); BUILD_BUG_ON(512 != sizeof (struct sysv4_super_block)); BUILD_BUG_ON(512 != sizeof (struct sysv2_super_block)); BUILD_BUG_ON(500 != sizeof (struct coh_super_block)); BUILD_BUG_ON(64 != sizeof (struct sysv_inode)); sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sbi->s_sb = sb; sbi->s_block_base = 0; mutex_init(&sbi->s_lock); sb->s_fs_info = sbi; sb_set_blocksize(sb, BLOCK_SIZE); for (i = 0; i < ARRAY_SIZE(flavours) && !size; i++) { brelse(bh); bh = sb_bread(sb, flavours[i].block); if (!bh) continue; size = flavours[i].test(SYSV_SB(sb), bh); } if (!size) goto Eunknown; switch (size) { case 1: blocknr = bh->b_blocknr << 1; brelse(bh); sb_set_blocksize(sb, 512); bh1 = sb_bread(sb, blocknr); bh = sb_bread(sb, blocknr + 1); break; case 2: bh1 = bh; break; case 3: blocknr = bh->b_blocknr >> 1; brelse(bh); sb_set_blocksize(sb, 2048); bh1 = bh = sb_bread(sb, blocknr); break; default: goto Ebadsize; } if (bh && bh1) { sbi->s_bh1 = bh1; sbi->s_bh2 = bh; if (complete_read_super(sb, silent, size)) return 0; } brelse(bh1); brelse(bh); sb_set_blocksize(sb, BLOCK_SIZE); printk("oldfs: cannot read superblock\n"); failed: kfree(sbi); return -EINVAL; Eunknown: brelse(bh); if 
(!silent) printk("VFS: unable to find oldfs superblock on device %s\n", sb->s_id); goto failed; Ebadsize: brelse(bh); if (!silent) printk("VFS: oldfs: unsupported block size (%dKb)\n", 1<<(size-2)); goto failed; } static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh) { struct v7_super_block *v7sb; struct sysv_inode *v7i; struct buffer_head *bh2; struct sysv_sb_info *sbi; sbi = sb->s_fs_info; /* plausibility check on superblock */ v7sb = (struct v7_super_block *) bh->b_data; if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE || fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD || fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE) return 0; /* plausibility check on root inode: it is a directory, with a nonzero size that is a multiple of 16 */ bh2 = sb_bread(sb, 2); if (bh2 == NULL) return 0; v7i = (struct sysv_inode *)(bh2->b_data + 64); if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR || (fs32_to_cpu(sbi, v7i->i_size) == 0) || (fs32_to_cpu(sbi, v7i->i_size) & 017) || (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES * sizeof(struct sysv_dir_entry))) { brelse(bh2); return 0; } brelse(bh2); return 1; } static int v7_fill_super(struct super_block *sb, void *data, int silent) { struct sysv_sb_info *sbi; struct buffer_head *bh; if (440 != sizeof (struct v7_super_block)) panic("V7 FS: bad super-block size"); if (64 != sizeof (struct sysv_inode)) panic("sysv fs: bad i-node size"); sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sbi->s_sb = sb; sbi->s_block_base = 0; sbi->s_type = FSTYPE_V7; mutex_init(&sbi->s_lock); sb->s_fs_info = sbi; sb_set_blocksize(sb, 512); if ((bh = sb_bread(sb, 1)) == NULL) { if (!silent) printk("VFS: unable to read V7 FS superblock on " "device %s.\n", sb->s_id); goto failed; } /* Try PDP-11 UNIX */ sbi->s_bytesex = BYTESEX_PDP; if (v7_sanity_check(sb, bh)) goto detected; /* Try PC/IX, v7/x86 */ sbi->s_bytesex = BYTESEX_LE; if (v7_sanity_check(sb, bh)) goto detected; goto failed; detected: sbi->s_bh1 = bh; sbi->s_bh2 = bh; if (complete_read_super(sb, silent, 1)) return 0; failed: printk(KERN_ERR "VFS: could not find a valid V7 on %s.\n", sb->s_id); brelse(bh); kfree(sbi); return -EINVAL; } /* Every kernel module contains stuff like this. 
*/ static struct dentry *sysv_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super); } static struct dentry *v7_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super); } static struct file_system_type sysv_fs_type = { .owner = THIS_MODULE, .name = "sysv", .mount = sysv_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("sysv"); static struct file_system_type v7_fs_type = { .owner = THIS_MODULE, .name = "v7", .mount = v7_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("v7"); MODULE_ALIAS("v7"); static int __init init_sysv_fs(void) { int error; error = sysv_init_icache(); if (error) goto out; error = register_filesystem(&sysv_fs_type); if (error) goto destroy_icache; error = register_filesystem(&v7_fs_type); if (error) goto unregister; return 0; unregister: unregister_filesystem(&sysv_fs_type); destroy_icache: sysv_destroy_icache(); out: return error; } static void __exit exit_sysv_fs(void) { unregister_filesystem(&sysv_fs_type); unregister_filesystem(&v7_fs_type); sysv_destroy_icache(); } module_init(init_sysv_fs) module_exit(exit_sysv_fs) MODULE_LICENSE("GPL"); |
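/*
 * Added example (hypothetical sketch, not in the original file): the shape
 * of the detect/detected hook pair that the flavours[] and flavour_setup[]
 * tables above expect.  FSTYPE_FOO, detect_foo() and detected_foo() do not
 * exist; they only illustrate the contract: detect_*() returns the size
 * code that sysv_fill_super() switches on (1 -> 512, 2 -> 1024, 3 -> 2048
 * byte blocks, since complete_read_super() uses bsize = 1 << (size + 8)),
 * or 0 when the superblock is not recognized.
 */
static int detect_foo(struct sysv_sb_info *sbi, struct buffer_head *bh)
{
	/* A real probe would check a magic value in bh->b_data in both
	 * byte orders, set sbi->s_bytesex and sbi->s_type on a match,
	 * and return the size code; 0 means "not this flavour". */
	return 0;
}

static void detected_foo(struct sysv_sb_info *sbi, unsigned *max_links)
{
	/* A real setup hook would point sbi->s_sb_* and sbi->s_bcache*
	 * at this flavour's on-disk superblock fields, as
	 * detected_sysv4() does above. */
	*max_links = SYSV_LINK_MAX;
}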
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMU_NOTIFIER_H #define _LINUX_MMU_NOTIFIER_H #include <linux/types.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/mm_types.h> #include <linux/srcu.h> struct mmu_notifier; struct mmu_notifier_ops; /* mmu_notifier_ops flags */ #define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01) #ifdef CONFIG_MMU_NOTIFIER /* * The mmu_notifier_mm structure is allocated and installed in * mm->mmu_notifier_mm inside the mm_take_all_locks() protected * critical section and it's released only when mm_count reaches zero * in mmdrop(). */ struct mmu_notifier_mm { /* all mmu notifiers registered in this mm are queued in this list */ struct hlist_head list; /* to serialize the list modifications and hlist_unhashed */ spinlock_t lock; }; struct mmu_notifier_ops { /* * Flags to specify behavior of callbacks for this MMU notifier. * Used to determine which context an operation may be called in. * * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not * block */ int flags; /* * Called either by mmu_notifier_unregister or when the mm is * being destroyed by exit_mmap, always before all pages are * freed. This can run concurrently with other mmu notifier * methods (the ones invoked outside the mm context) and it * should tear down all secondary mmu mappings and freeze the * secondary mmu. If this method isn't implemented you have to * be sure that nothing could possibly write to the pages * through the secondary mmu by the time the last thread with * tsk->mm == mm exits.
* * As side note: the pages freed after ->release returns could * be immediately reallocated by the gart at an alias physical * address with a different cache model, so if ->release isn't * implemented because all _software_ driven memory accesses * through the secondary mmu are terminated by the time the * last thread of this mm quits, you've also to be sure that * speculative _hardware_ operations can't allocate dirty * cachelines in the cpu that could not be snooped and made * coherent with the other read and write operations happening * through the gart alias address, so leading to memory * corruption. */ void (*release)(struct mmu_notifier *mn, struct mm_struct *mm); /* * clear_flush_young is called after the VM is * test-and-clearing the young/accessed bitflag in the * pte. This way the VM will provide proper aging to the * accesses to the page through the secondary MMUs and not * only to the ones through the Linux pte. * Start-end is necessary in case the secondary MMU is mapping the page * at a smaller granularity than the primary MMU. */ int (*clear_flush_young)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); /* * clear_young is a lightweight version of clear_flush_young. Like the * latter, it is supposed to test-and-clear the young/accessed bitflag * in the secondary pte, but it may omit flushing the secondary tlb. */ int (*clear_young)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); /* * test_young is called to check the young/accessed bitflag in * the secondary pte. This is used to know if the page is * frequently used without actually clearing the flag or tearing * down the secondary mapping on the page. */ int (*test_young)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address); /* * change_pte is called in cases that pte mapping to page is changed: * for example, when ksm remaps pte to point to a new shared page. */ void (*change_pte)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long address, pte_t pte); /* * invalidate_range_start() and invalidate_range_end() must be * paired and are called only when the mmap_sem and/or the * locks protecting the reverse maps are held. If the subsystem * can't guarantee that no additional references are taken to * the pages in the range, it has to implement the * invalidate_range() notifier to remove any references taken * after invalidate_range_start(). * * Invalidation of multiple concurrent ranges may be * optionally permitted by the driver. Either way the * establishment of sptes is forbidden in the range passed to * invalidate_range_begin/end for the whole duration of the * invalidate_range_begin/end critical section. * * invalidate_range_start() is called when all pages in the * range are still mapped and have at least a refcount of one. * * invalidate_range_end() is called when all pages in the * range have been unmapped and the pages have been freed by * the VM. * * The VM will remove the page table entries and potentially * the page between invalidate_range_start() and * invalidate_range_end(). If the page must not be freed * because of pending I/O or other circumstances then the * invalidate_range_start() callback (or the initial mapping * by the driver) must make sure that the refcount is kept * elevated. * * If the driver increases the refcount when the pages are * initially mapped into an address space then either * invalidate_range_start() or invalidate_range_end() may * decrease the refcount. 
If the refcount is decreased on * invalidate_range_start() then the VM can free pages as page * table entries are removed. If the refcount is only * dropped on invalidate_range_end() then the driver itself * will drop the last refcount but it must take care to flush * any secondary tlb before doing the final free on the * page. Pages will no longer be referenced by the linux * address space but may still be referenced by sptes until * the last refcount is dropped. * * If the blockable argument is set to false then the callback cannot * sleep and has to return with -EAGAIN. 0 should be returned * otherwise. * */ int (*invalidate_range_start)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end, bool blockable); void (*invalidate_range_end)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); /* * invalidate_range() is either called between * invalidate_range_start() and invalidate_range_end() when the * VM has to free pages that were unmapped, but before the * pages are actually freed, or outside of _start()/_end() when * a (remote) TLB flush is necessary. * * If invalidate_range() is used to manage a non-CPU TLB with * shared page-tables, it is not necessary to implement the * invalidate_range_start()/end() notifiers, as * invalidate_range() already catches the points in time when an * external TLB range needs to be flushed. For more in depth * discussion on this see Documentation/vm/mmu_notifier.rst * * Note that this function might be called with just a sub-range * of what was passed to invalidate_range_start()/end(), if * called between those functions. * * If this callback cannot block, and invalidate_range_{start,end} * cannot block, mmu_notifier_ops.flags should have * MMU_INVALIDATE_DOES_NOT_BLOCK set. */ void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); }; /* * The notifier chains are protected by mmap_sem and/or the reverse map * semaphores. Notifier chains are only changed when all reverse maps and * the mmap_sem locks are taken. * * Therefore notifier chains can only be traversed when either * * 1. mmap_sem is held. * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem). * 3.
No other concurrent thread can access the list (release) */ struct mmu_notifier { struct hlist_node hlist; const struct mmu_notifier_ops *ops; }; static inline int mm_has_notifiers(struct mm_struct *mm) { return unlikely(mm->mmu_notifier_mm); } extern int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm); extern int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm); extern void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm); extern void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, struct mm_struct *mm); extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end); extern int __mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end); extern int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); extern int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end, bool blockable); extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm); static inline void mmu_notifier_release(struct mm_struct *mm) { if (mm_has_notifiers(mm)) __mmu_notifier_release(mm); } static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_flush_young(mm, start, end); return 0; } static inline int mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_young(mm, start, end); return 0; } static inline int mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { if (mm_has_notifiers(mm)) return __mmu_notifier_test_young(mm, address); return 0; } static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { if (mm_has_notifiers(mm)) __mmu_notifier_change_pte(mm, address, pte); } static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) __mmu_notifier_invalidate_range_start(mm, start, end, true); } static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_invalidate_range_start(mm, start, end, false); return 0; } static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) __mmu_notifier_invalidate_range_end(mm, start, end, false); } static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) __mmu_notifier_invalidate_range_end(mm, start, end, true); } static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) __mmu_notifier_invalidate_range(mm, start, end); } static inline void mmu_notifier_mm_init(struct mm_struct *mm) { mm->mmu_notifier_mm = NULL; } static inline 
void mmu_notifier_mm_destroy(struct mm_struct *mm) { if (mm_has_notifiers(mm)) __mmu_notifier_mm_destroy(mm); } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = ptep_clear_flush_young(___vma, ___address, __ptep); \ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ ___address, \ ___address + \ PAGE_SIZE); \ __young; \ }) #define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ ___address, \ ___address + \ PMD_SIZE); \ __young; \ }) #define ptep_clear_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\ __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ ___address + PAGE_SIZE); \ __young; \ }) #define pmdp_clear_young_notify(__vma, __address, __pmdp) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\ __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ ___address + PMD_SIZE); \ __young; \ }) #define ptep_clear_flush_notify(__vma, __address, __ptep) \ ({ \ unsigned long ___addr = __address & PAGE_MASK; \ struct mm_struct *___mm = (__vma)->vm_mm; \ pte_t ___pte; \ \ ___pte = ptep_clear_flush(__vma, __address, __ptep); \ mmu_notifier_invalidate_range(___mm, ___addr, \ ___addr + PAGE_SIZE); \ \ ___pte; \ }) #define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \ ({ \ unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ struct mm_struct *___mm = (__vma)->vm_mm; \ pmd_t ___pmd; \ \ ___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd); \ mmu_notifier_invalidate_range(___mm, ___haddr, \ ___haddr + HPAGE_PMD_SIZE); \ \ ___pmd; \ }) #define pudp_huge_clear_flush_notify(__vma, __haddr, __pud) \ ({ \ unsigned long ___haddr = __haddr & HPAGE_PUD_MASK; \ struct mm_struct *___mm = (__vma)->vm_mm; \ pud_t ___pud; \ \ ___pud = pudp_huge_clear_flush(__vma, __haddr, __pud); \ mmu_notifier_invalidate_range(___mm, ___haddr, \ ___haddr + HPAGE_PUD_SIZE); \ \ ___pud; \ }) /* * set_pte_at_notify() sets the pte _after_ running the notifier. * This is safe to start by updating the secondary MMUs, because the primary MMU * pte invalidate must have already happened with a ptep_clear_flush() before * set_pte_at_notify() has been invoked. Updating the secondary MMUs first is * required when we change both the protection of the mapping from read-only to * read-write and the pfn (like during copy on write page faults). Otherwise the * old page would remain mapped readonly in the secondary MMUs after the new * page is already writable by some CPU through the primary MMU. 
*/ #define set_pte_at_notify(__mm, __address, __ptep, __pte) \ ({ \ struct mm_struct *___mm = __mm; \ unsigned long ___address = __address; \ pte_t ___pte = __pte; \ \ mmu_notifier_change_pte(___mm, ___address, ___pte); \ set_pte_at(___mm, ___address, __ptep, ___pte); \ }) extern void mmu_notifier_call_srcu(struct rcu_head *rcu, void (*func)(struct rcu_head *rcu)); extern void mmu_notifier_synchronize(void); #else /* CONFIG_MMU_NOTIFIER */ static inline int mm_has_notifiers(struct mm_struct *mm) { return 0; } static inline void mmu_notifier_release(struct mm_struct *mm) { } static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end) { return 0; } static inline int mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { return 0; } static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { } static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end) { } static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm, unsigned long start, unsigned long end) { return 0; } static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end) { } static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm, unsigned long start, unsigned long end) { } static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end) { } static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) { return false; } static inline void mmu_notifier_mm_init(struct mm_struct *mm) { } static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) { } #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young #define pmdp_clear_young_notify pmdp_test_and_clear_young #define ptep_clear_flush_notify ptep_clear_flush #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush #define pudp_huge_clear_flush_notify pudp_huge_clear_flush #define set_pte_at_notify set_pte_at #endif /* CONFIG_MMU_NOTIFIER */ #endif /* _LINUX_MMU_NOTIFIER_H */
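/*
 * Editorial note (not part of the original header): a minimal sketch of how a
 * driver that mirrors CPU page tables into a secondary MMU might hook into
 * the interface declared above. All ex_* names are hypothetical, and the
 * driver-side shadow-table/TLB bookkeeping is elided; only the callback
 * signatures and mmu_notifier_register() come from this header. Per the
 * documented contract, invalidate_range_start() may be invoked in a
 * non-blocking context (blockable == false), in which case it must return
 * -EAGAIN rather than sleep, and no new secondary mappings may be established
 * for the range until the matching invalidate_range_end().
 */
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <linux/errno.h>

struct ex_mirror {
	struct mmu_notifier mn;
	/* driver-private shadow page-table state would live here */
};

static void ex_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	/* mm is going away: tear down every secondary mapping we created */
}

static int ex_invalidate_range_start(struct mmu_notifier *mn,
				     struct mm_struct *mm,
				     unsigned long start, unsigned long end,
				     bool blockable)
{
	if (!blockable)
		return -EAGAIN;	/* flushing our secondary TLB may sleep */
	/* unmap [start, end) from the secondary MMU and flush its TLB */
	return 0;
}

static void ex_invalidate_range_end(struct mmu_notifier *mn,
				    struct mm_struct *mm,
				    unsigned long start, unsigned long end)
{
	/* re-establishing secondary mappings of the range may resume here */
}

static const struct mmu_notifier_ops ex_mmu_notifier_ops = {
	.flags			= 0,	/* 0: our callbacks are allowed to block */
	.release		= ex_release,
	.invalidate_range_start	= ex_invalidate_range_start,
	.invalidate_range_end	= ex_invalidate_range_end,
};

/* Hypothetical helper: attach the mirror to the calling process's mm. */
static int ex_mirror_register(struct ex_mirror *m)
{
	m->mn.ops = &ex_mmu_notifier_ops;
	return mmu_notifier_register(&m->mn, current->mm);
}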
/* * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ #include <linux/time.h> #include <linux/slab.h> #include <linux/string.h> #include "reiserfs.h" #include <linux/buffer_head.h> /* * To make any changes in the tree we find a node that contains item * to be changed/deleted or position in the node we insert a new item * to. We call this node S.
To do balancing we need to decide what we * will shift to the left/right neighbor, or to a new node where the new item * will go, etc. To make this analysis simpler we build a virtual * node. The virtual node is an array of items that will replace the items of * node S. (For instance, if we are going to delete an item, the virtual * node does not contain it.) The virtual node keeps information about * item sizes and types, mergeability of the first and last items, and sizes * of all entries in a directory item. We use this array of items when * calculating what we can shift to the neighbors and how many nodes we * need if we do no shifting, if we shift to the left/right * neighbor, or to both. */ /* * Takes the item number in the virtual node, returns the number that the item * has in the source buffer */ static inline int old_item_num(int new_num, int affected_item_num, int mode) { if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num) return new_num; if (mode == M_INSERT) { RFALSE(new_num == 0, "vs-8005: for INSERT mode and item number of inserted item"); return new_num - 1; } RFALSE(mode != M_DELETE, "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'", mode); /* delete mode */ return new_num + 1; } static void create_virtual_node(struct tree_balance *tb, int h) { struct item_head *ih; struct virtual_node *vn = tb->tb_vn; int new_num; struct buffer_head *Sh; /* this comes from tb->S[h] */ Sh = PATH_H_PBUFFER(tb->tb_path, h); /* size of changed node */ vn->vn_size = MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h]; /* for internal nodes the array of virtual items is not created */ if (h) { vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE); return; } /* number of items in virtual node */ vn->vn_nr_item = B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) - ((vn->vn_mode == M_DELETE) ? 1 : 0); /* first virtual item */ vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1); memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item)); vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item); /* first item in the node */ ih = item_head(Sh, 0); /* define the mergeability for 0-th item (if it is not being deleted) */ if (op_is_left_mergeable(&ih->ih_key, Sh->b_size) && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num)) vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE; /* * go through all items that remain in the virtual * node (except for the new (inserted) one) */ for (new_num = 0; new_num < vn->vn_nr_item; new_num++) { int j; struct virtual_item *vi = vn->vn_vi + new_num; int is_affected = ((new_num != vn->vn_affected_item_num) ?
0 : 1); if (is_affected && vn->vn_mode == M_INSERT) continue; /* get item number in source node */ j = old_item_num(new_num, vn->vn_affected_item_num, vn->vn_mode); vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE; vi->vi_ih = ih + j; vi->vi_item = ih_item_body(Sh, ih + j); vi->vi_uarea = vn->vn_free_ptr; /* * FIXME: there is no check that item operation did not * consume too much memory */ vn->vn_free_ptr += op_create_vi(vn, vi, is_affected, tb->insert_size[0]); if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr) reiserfs_panic(tb->tb_sb, "vs-8030", "virtual node space consumed"); if (!is_affected) /* this is not being changed */ continue; if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) { vn->vn_vi[new_num].vi_item_len += tb->insert_size[0]; /* pointer to data which is going to be pasted */ vi->vi_new_data = vn->vn_data; } } /* virtual inserted item is not defined yet */ if (vn->vn_mode == M_INSERT) { struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num; RFALSE(vn->vn_ins_ih == NULL, "vs-8040: item header of inserted item is not specified"); vi->vi_item_len = tb->insert_size[0]; vi->vi_ih = vn->vn_ins_ih; vi->vi_item = vn->vn_data; vi->vi_uarea = vn->vn_free_ptr; op_create_vi(vn, vi, 0 /*not pasted or cut */ , tb->insert_size[0]); } /* * set right merge flag we take right delimiting key and * check whether it is a mergeable item */ if (tb->CFR[0]) { struct reiserfs_key *key; key = internal_key(tb->CFR[0], tb->rkey[0]); if (op_is_left_mergeable(key, Sh->b_size) && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) vn->vn_vi[vn->vn_nr_item - 1].vi_type |= VI_TYPE_RIGHT_MERGEABLE; #ifdef CONFIG_REISERFS_CHECK if (op_is_left_mergeable(key, Sh->b_size) && !(vn->vn_mode != M_DELETE || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) { /* * we delete last item and it could be merged * with right neighbor's first item */ if (! (B_NR_ITEMS(Sh) == 1 && is_direntry_le_ih(item_head(Sh, 0)) && ih_entry_count(item_head(Sh, 0)) == 1)) { /* * node contains more than 1 item, or item * is not directory item, or this item * contains more than 1 entry */ print_block(Sh, 0, -1, -1); reiserfs_panic(tb->tb_sb, "vs-8045", "rdkey %k, affected item==%d " "(mode==%c) Must be %c", key, vn->vn_affected_item_num, vn->vn_mode, M_DELETE); } } #endif } } /* * Using virtual node check, how many items can be * shifted to left neighbor */ static void check_left(struct tree_balance *tb, int h, int cur_free) { int i; struct virtual_node *vn = tb->tb_vn; struct virtual_item *vi; int d_size, ih_size; RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free); /* internal level */ if (h > 0) { tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE); return; } /* leaf level */ if (!cur_free || !vn->vn_nr_item) { /* no free space or nothing to move */ tb->lnum[h] = 0; tb->lbytes = -1; return; } RFALSE(!PATH_H_PPARENT(tb->tb_path, 0), "vs-8055: parent does not exist or invalid"); vi = vn->vn_vi; if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? 
IH_SIZE : 0))) { /* all contents of S[0] fits into L[0] */ RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, "vs-8055: invalid mode or balance condition failed"); tb->lnum[0] = vn->vn_nr_item; tb->lbytes = -1; return; } d_size = 0, ih_size = IH_SIZE; /* first item may be merge with last item in left neighbor */ if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE) d_size = -((int)IH_SIZE), ih_size = 0; tb->lnum[0] = 0; for (i = 0; i < vn->vn_nr_item; i++, ih_size = IH_SIZE, d_size = 0, vi++) { d_size += vi->vi_item_len; if (cur_free >= d_size) { /* the item can be shifted entirely */ cur_free -= d_size; tb->lnum[0]++; continue; } /* the item cannot be shifted entirely, try to split it */ /* * check whether L[0] can hold ih and at least one byte * of the item body */ /* cannot shift even a part of the current item */ if (cur_free <= ih_size) { tb->lbytes = -1; return; } cur_free -= ih_size; tb->lbytes = op_check_left(vi, cur_free, 0, 0); if (tb->lbytes != -1) /* count partially shifted item */ tb->lnum[0]++; break; } return; } /* * Using virtual node check, how many items can be * shifted to right neighbor */ static void check_right(struct tree_balance *tb, int h, int cur_free) { int i; struct virtual_node *vn = tb->tb_vn; struct virtual_item *vi; int d_size, ih_size; RFALSE(cur_free < 0, "vs-8070: cur_free < 0"); /* internal level */ if (h > 0) { tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE); return; } /* leaf level */ if (!cur_free || !vn->vn_nr_item) { /* no free space */ tb->rnum[h] = 0; tb->rbytes = -1; return; } RFALSE(!PATH_H_PPARENT(tb->tb_path, 0), "vs-8075: parent does not exist or invalid"); vi = vn->vn_vi + vn->vn_nr_item - 1; if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) { /* all contents of S[0] fits into R[0] */ RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, "vs-8080: invalid mode or balance condition failed"); tb->rnum[h] = vn->vn_nr_item; tb->rbytes = -1; return; } d_size = 0, ih_size = IH_SIZE; /* last item may be merge with first item in right neighbor */ if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) d_size = -(int)IH_SIZE, ih_size = 0; tb->rnum[0] = 0; for (i = vn->vn_nr_item - 1; i >= 0; i--, d_size = 0, ih_size = IH_SIZE, vi--) { d_size += vi->vi_item_len; if (cur_free >= d_size) { /* the item can be shifted entirely */ cur_free -= d_size; tb->rnum[0]++; continue; } /* * check whether R[0] can hold ih and at least one * byte of the item body */ /* cannot shift even a part of the current item */ if (cur_free <= ih_size) { tb->rbytes = -1; return; } /* * R[0] can hold the header of the item and at least * one byte of its body */ cur_free -= ih_size; /* cur_free is still > 0 */ tb->rbytes = op_check_right(vi, cur_free); if (tb->rbytes != -1) /* count partially shifted item */ tb->rnum[0]++; break; } return; } /* * from - number of items, which are shifted to left neighbor entirely * to - number of item, which are shifted to right neighbor entirely * from_bytes - number of bytes of boundary item (or directory entries) * which are shifted to left neighbor * to_bytes - number of bytes of boundary item (or directory entries) * which are shifted to right neighbor */ static int get_num_ver(int mode, struct tree_balance *tb, int h, int from, int from_bytes, int to, int to_bytes, short *snum012, int flow) { int i; int cur_free; int units; struct virtual_node *vn = tb->tb_vn; int total_node_size, max_node_size, current_item_size; int needed_nodes; /* position of item we start filling node from */ int start_item; /* 
position of item we finish filling node by */ int end_item; /* * number of first bytes (entries for directory) of start_item-th item * we do not include into node that is being filled */ int start_bytes; /* * number of last bytes (entries for directory) of end_item-th item * we do node include into node that is being filled */ int end_bytes; /* * these are positions in virtual item of items, that are split * between S[0] and S1new and S1new and S2new */ int split_item_positions[2]; split_item_positions[0] = -1; split_item_positions[1] = -1; /* * We only create additional nodes if we are in insert or paste mode * or we are in replace mode at the internal level. If h is 0 and * the mode is M_REPLACE then in fix_nodes we change the mode to * paste or insert before we get here in the code. */ RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE), "vs-8100: insert_size < 0 in overflow"); max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h)); /* * snum012 [0-2] - number of items, that lay * to S[0], first new node and second new node */ snum012[3] = -1; /* s1bytes */ snum012[4] = -1; /* s2bytes */ /* internal level */ if (h > 0) { i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE); if (i == max_node_size) return 1; return (i / max_node_size + 1); } /* leaf level */ needed_nodes = 1; total_node_size = 0; cur_free = max_node_size; /* start from 'from'-th item */ start_item = from; /* skip its first 'start_bytes' units */ start_bytes = ((from_bytes != -1) ? from_bytes : 0); /* last included item is the 'end_item'-th one */ end_item = vn->vn_nr_item - to - 1; /* do not count last 'end_bytes' units of 'end_item'-th item */ end_bytes = (to_bytes != -1) ? to_bytes : 0; /* * go through all item beginning from the start_item-th item * and ending by the end_item-th item. Do not count first * 'start_bytes' units of 'start_item'-th item and last * 'end_bytes' of 'end_item'-th item */ for (i = start_item; i <= end_item; i++) { struct virtual_item *vi = vn->vn_vi + i; int skip_from_end = ((i == end_item) ? end_bytes : 0); RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed"); /* get size of current item */ current_item_size = vi->vi_item_len; /* * do not take in calculation head part (from_bytes) * of from-th item */ current_item_size -= op_part_size(vi, 0 /*from start */ , start_bytes); /* do not take in calculation tail part of last item */ current_item_size -= op_part_size(vi, 1 /*from end */ , skip_from_end); /* if item fits into current node entierly */ if (total_node_size + current_item_size <= max_node_size) { snum012[needed_nodes - 1]++; total_node_size += current_item_size; start_bytes = 0; continue; } /* * virtual item length is longer, than max size of item in * a node. It is impossible for direct item */ if (current_item_size > max_node_size) { RFALSE(is_direct_le_ih(vi->vi_ih), "vs-8110: " "direct item length is %d. 
It can not be longer than %d", current_item_size, max_node_size); /* we will try to split it */ flow = 1; } /* as we do not split items, take new node and continue */ if (!flow) { needed_nodes++; i--; total_node_size = 0; continue; } /* * calculate number of item units which fit into node being * filled */ { int free_space; free_space = max_node_size - total_node_size - IH_SIZE; units = op_check_left(vi, free_space, start_bytes, skip_from_end); /* * nothing fits into current node, take new * node and continue */ if (units == -1) { needed_nodes++, i--, total_node_size = 0; continue; } } /* something fits into the current node */ start_bytes += units; snum012[needed_nodes - 1 + 3] = units; if (needed_nodes > 2) reiserfs_warning(tb->tb_sb, "vs-8111", "split_item_position is out of range"); snum012[needed_nodes - 1]++; split_item_positions[needed_nodes - 1] = i; needed_nodes++; /* continue from the same item with start_bytes != -1 */ start_item = i; i--; total_node_size = 0; } /* * sum012[4] (if it is not -1) contains number of units of which * are to be in S1new, snum012[3] - to be in S0. They are supposed * to be S1bytes and S2bytes correspondingly, so recalculate */ if (snum012[4] > 0) { int split_item_num; int bytes_to_r, bytes_to_l; int bytes_to_S1new; split_item_num = split_item_positions[1]; bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0); bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0); bytes_to_S1new = ((split_item_positions[0] == split_item_positions[1]) ? snum012[3] : 0); /* s2bytes */ snum012[4] = op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] - bytes_to_r - bytes_to_l - bytes_to_S1new; if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY && vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT) reiserfs_warning(tb->tb_sb, "vs-8115", "not directory or indirect item"); } /* now we know S2bytes, calculate S1bytes */ if (snum012[3] > 0) { int split_item_num; int bytes_to_r, bytes_to_l; int bytes_to_S2new; split_item_num = split_item_positions[0]; bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0); bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0); bytes_to_S2new = ((split_item_positions[0] == split_item_positions[1] && snum012[4] != -1) ? snum012[4] : 0); /* s1bytes */ snum012[3] = op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] - bytes_to_r - bytes_to_l - bytes_to_S2new; } return needed_nodes; } /* * Set parameters for balancing. * Performs write of results of analysis of balancing into structure tb, * where it will later be used by the functions that actually do the balancing. * Parameters: * tb tree_balance structure; * h current level of the node; * lnum number of items from S[h] that must be shifted to L[h]; * rnum number of items from S[h] that must be shifted to R[h]; * blk_num number of blocks that S[h] will be splitted into; * s012 number of items that fall into splitted nodes. 
* lbytes number of bytes which flow to the left neighbor from the * item that is not not shifted entirely * rbytes number of bytes which flow to the right neighbor from the * item that is not not shifted entirely * s1bytes number of bytes which flow to the first new node when * S[0] splits (this number is contained in s012 array) */ static void set_parameters(struct tree_balance *tb, int h, int lnum, int rnum, int blk_num, short *s012, int lb, int rb) { tb->lnum[h] = lnum; tb->rnum[h] = rnum; tb->blknum[h] = blk_num; /* only for leaf level */ if (h == 0) { if (s012 != NULL) { tb->s0num = *s012++; tb->snum[0] = *s012++; tb->snum[1] = *s012++; tb->sbytes[0] = *s012++; tb->sbytes[1] = *s012; } tb->lbytes = lb; tb->rbytes = rb; } PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum); PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum); PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb); PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb); } /* * check if node disappears if we shift tb->lnum[0] items to left * neighbor and tb->rnum[0] to the right one. */ static int is_leaf_removable(struct tree_balance *tb) { struct virtual_node *vn = tb->tb_vn; int to_left, to_right; int size; int remain_items; /* * number of items that will be shifted to left (right) neighbor * entirely */ to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0); to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0); remain_items = vn->vn_nr_item; /* how many items remain in S[0] after shiftings to neighbors */ remain_items -= (to_left + to_right); /* all content of node can be shifted to neighbors */ if (remain_items < 1) { set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0, NULL, -1, -1); return 1; } /* S[0] is not removable */ if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1) return 0; /* check whether we can divide 1 remaining item between neighbors */ /* get size of remaining item (in item units) */ size = op_unit_num(&vn->vn_vi[to_left]); if (tb->lbytes + tb->rbytes >= size) { set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL, tb->lbytes, -1); return 1; } return 0; } /* check whether L, S, R can be joined in one node */ static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree) { struct virtual_node *vn = tb->tb_vn; int ih_size; struct buffer_head *S0; S0 = PATH_H_PBUFFER(tb->tb_path, 0); ih_size = 0; if (vn->vn_nr_item) { if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE) ih_size += IH_SIZE; if (vn->vn_vi[vn->vn_nr_item - 1]. vi_type & VI_TYPE_RIGHT_MERGEABLE) ih_size += IH_SIZE; } else { /* there was only one item and it will be deleted */ struct item_head *ih; RFALSE(B_NR_ITEMS(S0) != 1, "vs-8125: item number must be 1: it is %d", B_NR_ITEMS(S0)); ih = item_head(S0, 0); if (tb->CFR[0] && !comp_short_le_keys(&ih->ih_key, internal_key(tb->CFR[0], tb->rkey[0]))) /* * Directory must be in correct state here: that is * somewhere at the left side should exist first * directory item. But the item being deleted can * not be that first one because its right neighbor * is item of the same directory. (But first item * always gets deleted in last turn). 
So, neighbors * of deleted item can be merged, so we can save * ih_size */ if (is_direntry_le_ih(ih)) { ih_size = IH_SIZE; /* * we might check that left neighbor exists * and is of the same directory */ RFALSE(le_ih_k_offset(ih) == DOT_OFFSET, "vs-8130: first directory item can not be removed until directory is not empty"); } } if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) { set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1); PROC_INFO_INC(tb->tb_sb, leaves_removable); return 1; } return 0; } /* when we do not split item, lnum and rnum are numbers of entire items */ #define SET_PAR_SHIFT_LEFT \ if (h)\ {\ int to_l;\ \ to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\ (MAX_NR_KEY(Sh) + 1 - lpar);\ \ set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\ }\ else \ {\ if (lset==LEFT_SHIFT_FLOW)\ set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\ tb->lbytes, -1);\ else\ set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\ -1, -1);\ } #define SET_PAR_SHIFT_RIGHT \ if (h)\ {\ int to_r;\ \ to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\ \ set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\ }\ else \ {\ if (rset==RIGHT_SHIFT_FLOW)\ set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\ -1, tb->rbytes);\ else\ set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\ -1, -1);\ } static void free_buffers_in_tb(struct tree_balance *tb) { int i; pathrelse(tb->tb_path); for (i = 0; i < MAX_HEIGHT; i++) { brelse(tb->L[i]); brelse(tb->R[i]); brelse(tb->FL[i]); brelse(tb->FR[i]); brelse(tb->CFL[i]); brelse(tb->CFR[i]); tb->L[i] = NULL; tb->R[i] = NULL; tb->FL[i] = NULL; tb->FR[i] = NULL; tb->CFL[i] = NULL; tb->CFR[i] = NULL; } } /* * Get new buffers for storing new nodes that are created while balancing. * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; * CARRY_ON - schedule didn't occur while the function worked; * NO_DISK_SPACE - no disk space. */ /* The function is NOT SCHEDULE-SAFE! */ static int get_empty_nodes(struct tree_balance *tb, int h) { struct buffer_head *new_bh, *Sh = PATH_H_PBUFFER(tb->tb_path, h); b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, }; int counter, number_of_freeblk; int amount_needed; /* number of needed empty blocks */ int retval = CARRY_ON; struct super_block *sb = tb->tb_sb; /* * number_of_freeblk is the number of empty blocks which have been * acquired for use by the balancing algorithm minus the number of * empty blocks used in the previous levels of the analysis, * number_of_freeblk = tb->cur_blknum can be non-zero if a schedule * occurs after empty blocks are acquired, and the balancing analysis * is then restarted, amount_needed is the number needed by this * level (h) of the balancing analysis. * * Note that for systems with many processes writing, it would be * more layout optimal to calculate the total number needed by all * levels and then to run reiserfs_new_blocks to get all of them at * once. */ /* * Initiate number_of_freeblk to the amount acquired prior to the * restart of the analysis or 0 if not restarted, then subtract the * amount needed by all of the levels of the tree below h. */ /* blknum includes S[h], so we subtract 1 in this calculation */ for (counter = 0, number_of_freeblk = tb->cur_blknum; counter < h; counter++) number_of_freeblk -= (tb->blknum[counter]) ? (tb->blknum[counter] - 1) : 0; /* Allocate missing empty blocks. */ /* if Sh == 0 then we are getting a new root */ amount_needed = (Sh) ? 
(tb->blknum[h] - 1) : 1; /* * Amount_needed = the amount that we need more than the * amount that we have. */ if (amount_needed > number_of_freeblk) amount_needed -= number_of_freeblk; else /* If we have enough already then there is nothing to do. */ return CARRY_ON; /* * No need to check quota - is not allocated for blocks used * for formatted nodes */ if (reiserfs_new_form_blocknrs(tb, blocknrs, amount_needed) == NO_DISK_SPACE) return NO_DISK_SPACE; /* for each blocknumber we just got, get a buffer and stick it on FEB */ for (blocknr = blocknrs, counter = 0; counter < amount_needed; blocknr++, counter++) { RFALSE(!*blocknr, "PAP-8135: reiserfs_new_blocknrs failed when got new blocks"); new_bh = sb_getblk(sb, *blocknr); RFALSE(buffer_dirty(new_bh) || buffer_journaled(new_bh) || buffer_journal_dirty(new_bh), "PAP-8140: journaled or dirty buffer %b for the new block", new_bh); /* Put empty buffers into the array. */ RFALSE(tb->FEB[tb->cur_blknum], "PAP-8141: busy slot for new buffer"); set_buffer_journal_new(new_bh); tb->FEB[tb->cur_blknum++] = new_bh; } if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb)) retval = REPEAT_SEARCH; return retval; } /* * Get free space of the left neighbor, which is stored in the parent * node of the left neighbor. */ static int get_lfree(struct tree_balance *tb, int h) { struct buffer_head *l, *f; int order; if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL || (l = tb->FL[h]) == NULL) return 0; if (f == l) order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1; else { order = B_NR_ITEMS(l); f = l; } return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order))); } /* * Get free space of the right neighbor, * which is stored in the parent node of the right neighbor. */ static int get_rfree(struct tree_balance *tb, int h) { struct buffer_head *r, *f; int order; if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL || (r = tb->FR[h]) == NULL) return 0; if (f == r) order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1; else { order = 0; f = r; } return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order))); } /* Check whether left neighbor is in memory. */ static int is_left_neighbor_in_cache(struct tree_balance *tb, int h) { struct buffer_head *father, *left; struct super_block *sb = tb->tb_sb; b_blocknr_t left_neighbor_blocknr; int left_neighbor_position; /* Father of the left neighbor does not exist. */ if (!tb->FL[h]) return 0; /* Calculate father of the node to be balanced. */ father = PATH_H_PBUFFER(tb->tb_path, h + 1); RFALSE(!father || !B_IS_IN_TREE(father) || !B_IS_IN_TREE(tb->FL[h]) || !buffer_uptodate(father) || !buffer_uptodate(tb->FL[h]), "vs-8165: F[h] (%b) or FL[h] (%b) is invalid", father, tb->FL[h]); /* * Get position of the pointer to the left neighbor * into the left father. */ left_neighbor_position = (father == tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->FL[h]); /* Get left neighbor block number. */ left_neighbor_blocknr = B_N_CHILD_NUM(tb->FL[h], left_neighbor_position); /* Look for the left neighbor in the cache. 
*/ if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) { RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left), "vs-8170: left neighbor (%b %z) is not in the tree", left, left); put_bh(left); return 1; } return 0; } #define LEFT_PARENTS 'l' #define RIGHT_PARENTS 'r' static void decrement_key(struct cpu_key *key) { /* call item specific function for this key */ item_ops[cpu_key_k_type(key)]->decrement_key(key); } /* * Calculate far left/right parent of the left/right neighbor of the * current node, that is calculate the left/right (FL[h]/FR[h]) neighbor * of the parent F[h]. * Calculate left/right common parent of the current node and L[h]/R[h]. * Calculate left/right delimiting key position. * Returns: PATH_INCORRECT - path in the tree is not correct * SCHEDULE_OCCURRED - schedule occurred while the function worked * CARRY_ON - schedule didn't occur while the function * worked */ static int get_far_parent(struct tree_balance *tb, int h, struct buffer_head **pfather, struct buffer_head **pcom_father, char c_lr_par) { struct buffer_head *parent; INITIALIZE_PATH(s_path_to_neighbor_father); struct treepath *path = tb->tb_path; struct cpu_key s_lr_father_key; int counter, position = INT_MAX, first_last_position = 0, path_offset = PATH_H_PATH_OFFSET(path, h); /* * Starting from F[h] go upwards in the tree, and look for the common * ancestor of F[h], and its neighbor l/r, that should be obtained. */ counter = path_offset; RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET, "PAP-8180: invalid path length"); for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) { /* * Check whether parent of the current buffer in the path * is really parent in the tree. */ if (!B_IS_IN_TREE (parent = PATH_OFFSET_PBUFFER(path, counter - 1))) return REPEAT_SEARCH; /* Check whether position in the parent is correct. */ if ((position = PATH_OFFSET_POSITION(path, counter - 1)) > B_NR_ITEMS(parent)) return REPEAT_SEARCH; /* * Check whether parent at the path really points * to the child. */ if (B_N_CHILD_NUM(parent, position) != PATH_OFFSET_PBUFFER(path, counter)->b_blocknr) return REPEAT_SEARCH; /* * Return delimiting key if position in the parent is not * equal to first/last one. */ if (c_lr_par == RIGHT_PARENTS) first_last_position = B_NR_ITEMS(parent); if (position != first_last_position) { *pcom_father = parent; get_bh(*pcom_father); /*(*pcom_father = parent)->b_count++; */ break; } } /* if we are in the root of the tree, then there is no common father */ if (counter == FIRST_PATH_ELEMENT_OFFSET) { /* * Check whether first buffer in the path is the * root of the tree. */ if (PATH_OFFSET_PBUFFER (tb->tb_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) { *pfather = *pcom_father = NULL; return CARRY_ON; } return REPEAT_SEARCH; } RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL, "PAP-8185: (%b %z) level too small", *pcom_father, *pcom_father); /* Check whether the common parent is locked. */ if (buffer_locked(*pcom_father)) { /* Release the write lock while the buffer is busy */ int depth = reiserfs_write_unlock_nested(tb->tb_sb); __wait_on_buffer(*pcom_father); reiserfs_write_lock_nested(tb->tb_sb, depth); if (FILESYSTEM_CHANGED_TB(tb)) { brelse(*pcom_father); return REPEAT_SEARCH; } } /* * So, we got common parent of the current node and its * left/right neighbor. Now we are getting the parent of the * left/right neighbor. */ /* Form key to get parent of the left/right neighbor. */ le_key2cpu_key(&s_lr_father_key, internal_key(*pcom_father, (c_lr_par == LEFT_PARENTS) ? 
(tb->lkey[h - 1] = position - 1) : (tb->rkey[h - 1] = position))); if (c_lr_par == LEFT_PARENTS) decrement_key(&s_lr_father_key); if (search_by_key (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, h + 1) == IO_ERROR) /* path is released */ return IO_ERROR; if (FILESYSTEM_CHANGED_TB(tb)) { pathrelse(&s_path_to_neighbor_father); brelse(*pcom_father); return REPEAT_SEARCH; } *pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father); RFALSE(B_LEVEL(*pfather) != h + 1, "PAP-8190: (%b %z) level too small", *pfather, *pfather); RFALSE(s_path_to_neighbor_father.path_length < FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small"); s_path_to_neighbor_father.path_length--; pathrelse(&s_path_to_neighbor_father); return CARRY_ON; } /* * Get parents of neighbors of node in the path(S[path_offset]) and * common parents of S[path_offset] and L[path_offset]/R[path_offset]: * F[path_offset], FL[path_offset], FR[path_offset], CFL[path_offset], * CFR[path_offset]. * Calculate numbers of left and right delimiting keys position: * lkey[path_offset], rkey[path_offset]. * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked * CARRY_ON - schedule didn't occur while the function worked */ static int get_parents(struct tree_balance *tb, int h) { struct treepath *path = tb->tb_path; int position, ret, path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h); struct buffer_head *curf, *curcf; /* Current node is the root of the tree or will be root of the tree */ if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) { /* * The root can not have parents. * Release nodes which previously were obtained as * parents of the current node neighbors. */ brelse(tb->FL[h]); brelse(tb->CFL[h]); brelse(tb->FR[h]); brelse(tb->CFR[h]); tb->FL[h] = NULL; tb->CFL[h] = NULL; tb->FR[h] = NULL; tb->CFR[h] = NULL; return CARRY_ON; } /* Get parent FL[path_offset] of L[path_offset]. */ position = PATH_OFFSET_POSITION(path, path_offset - 1); if (position) { /* Current node is not the first child of its parent. */ curf = PATH_OFFSET_PBUFFER(path, path_offset - 1); curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1); get_bh(curf); get_bh(curf); tb->lkey[h] = position - 1; } else { /* * Calculate current parent of L[path_offset], which is the * left neighbor of the current node. Calculate current * common parent of L[path_offset] and the current node. * Note that CFL[path_offset] not equal FL[path_offset] and * CFL[path_offset] not equal F[path_offset]. * Calculate lkey[path_offset]. */ if ((ret = get_far_parent(tb, h + 1, &curf, &curcf, LEFT_PARENTS)) != CARRY_ON) return ret; } brelse(tb->FL[h]); tb->FL[h] = curf; /* New initialization of FL[h]. */ brelse(tb->CFL[h]); tb->CFL[h] = curcf; /* New initialization of CFL[h]. */ RFALSE((curf && !B_IS_IN_TREE(curf)) || (curcf && !B_IS_IN_TREE(curcf)), "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf); /* Get parent FR[h] of R[h]. */ /* Current node is the last child of F[h]. FR[h] != F[h]. */ if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) { /* * Calculate current parent of R[h], which is the right * neighbor of F[h]. Calculate current common parent of * R[h] and current node. Note that CFR[h] not equal * FR[path_offset] and CFR[h] not equal F[h]. */ if ((ret = get_far_parent(tb, h + 1, &curf, &curcf, RIGHT_PARENTS)) != CARRY_ON) return ret; } else { /* Current node is not the last child of its parent F[h]. 
*/ curf = PATH_OFFSET_PBUFFER(path, path_offset - 1); curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1); get_bh(curf); get_bh(curf); tb->rkey[h] = position; } brelse(tb->FR[h]); /* New initialization of FR[path_offset]. */ tb->FR[h] = curf; brelse(tb->CFR[h]); /* New initialization of CFR[path_offset]. */ tb->CFR[h] = curcf; RFALSE((curf && !B_IS_IN_TREE(curf)) || (curcf && !B_IS_IN_TREE(curcf)), "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf); return CARRY_ON; } /* * it is possible to remove node as result of shiftings to * neighbors even when we insert or paste item. */ static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree, struct tree_balance *tb, int h) { struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h); int levbytes = tb->insert_size[h]; struct item_head *ih; struct reiserfs_key *r_key = NULL; ih = item_head(Sh, 0); if (tb->CFR[h]) r_key = internal_key(tb->CFR[h], tb->rkey[h]); if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes /* shifting may merge items which might save space */ - ((!h && op_is_left_mergeable(&ih->ih_key, Sh->b_size)) ? IH_SIZE : 0) - ((!h && r_key && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0) + ((h) ? KEY_SIZE : 0)) { /* node can not be removed */ if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */ if (!h) tb->s0num = B_NR_ITEMS(Sh) + ((mode == M_INSERT) ? 1 : 0); set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); return NO_BALANCING_NEEDED; } } PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]); return !NO_BALANCING_NEEDED; } /* * Check whether current node S[h] is balanced when increasing its size by * Inserting or Pasting. * Calculate parameters for balancing for current level h. * Parameters: * tb tree_balance structure; * h current level of the node; * inum item number in S[h]; * mode i - insert, p - paste; * Returns: 1 - schedule occurred; * 0 - balancing for higher levels needed; * -1 - no balancing for higher levels needed; * -2 - no disk space. */ /* ip means Inserting or Pasting */ static int ip_check_balance(struct tree_balance *tb, int h) { struct virtual_node *vn = tb->tb_vn; /* * Number of bytes that must be inserted into (value is negative * if bytes are deleted) buffer which contains node being balanced. * The mnemonic is that the attempted change in node space used * level is levbytes bytes. */ int levbytes; int ret; int lfree, sfree, rfree /* free space in L, S and R */ ; /* * nver is short for number of vertixes, and lnver is the number if * we shift to the left, rnver is the number if we shift to the * right, and lrnver is the number if we shift in both directions. * The goal is to minimize first the number of vertixes, and second, * the number of vertixes whose contents are changed by shifting, * and third the number of uncached vertixes whose contents are * changed by shifting and must be read from disk. */ int nver, lnver, rnver, lrnver; /* * used at leaf level only, S0 = S[0] is the node being balanced, * sInum [ I = 0,1,2 ] is the number of items that will * remain in node SI after balancing. S1 and S2 are new * nodes that might be created. */ /* * we perform 8 calls to get_num_ver(). For each call we * calculate five parameters. 
where 4th parameter is s1bytes * and 5th - s2bytes * * s0num, s1num, s2num for 8 cases * 0,1 - do not shift and do not shift but bottle * 2 - shift only whole item to left * 3 - shift to left and bottle as much as possible * 4,5 - shift to right (whole items and as much as possible * 6,7 - shift to both directions (whole items and as much as possible) */ short snum012[40] = { 0, }; /* Sh is the node whose balance is currently being checked */ struct buffer_head *Sh; Sh = PATH_H_PBUFFER(tb->tb_path, h); levbytes = tb->insert_size[h]; /* Calculate balance parameters for creating new root. */ if (!Sh) { if (!h) reiserfs_panic(tb->tb_sb, "vs-8210", "S[0] can not be 0"); switch (ret = get_empty_nodes(tb, h)) { /* no balancing for higher levels needed */ case CARRY_ON: set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); return NO_BALANCING_NEEDED; case NO_DISK_SPACE: case REPEAT_SEARCH: return ret; default: reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect " "return value of get_empty_nodes"); } } /* get parents of S[h] neighbors. */ ret = get_parents(tb, h); if (ret != CARRY_ON) return ret; sfree = B_FREE_SPACE(Sh); /* get free space of neighbors */ rfree = get_rfree(tb, h); lfree = get_lfree(tb, h); /* and new item fits into node S[h] without any shifting */ if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) == NO_BALANCING_NEEDED) return NO_BALANCING_NEEDED; create_virtual_node(tb, h); /* * determine maximal number of items we can shift to the left * neighbor (in tb structure) and the maximal number of bytes * that can flow to the left neighbor from the left most liquid * item that cannot be shifted from S[0] entirely (returned value) */ check_left(tb, h, lfree); /* * determine maximal number of items we can shift to the right * neighbor (in tb structure) and the maximal number of bytes * that can flow to the right neighbor from the right most liquid * item that cannot be shifted from S[0] entirely (returned value) */ check_right(tb, h, rfree); /* * all contents of internal node S[h] can be moved into its * neighbors, S[h] will be removed after balancing */ if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) { int to_r; /* * Since we are working on internal nodes, and our internal * nodes have fixed size entries, then we can balance by the * number of items rather than the space they consume. In this * routine we set the left node equal to the right node, * allowing a difference of less than or equal to 1 child * pointer. */ to_r = ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1); return CARRY_ON; } /* * this checks balance condition, that any two neighboring nodes * can not fit in one node */ RFALSE(h && (tb->lnum[h] >= vn->vn_nr_item + 1 || tb->rnum[h] >= vn->vn_nr_item + 1), "vs-8220: tree is not balanced on internal level"); RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) || (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))), "vs-8225: tree is not balanced on leaf level"); /* * all contents of S[0] can be moved into its neighbors * S[0] will be removed after balancing. */ if (!h && is_leaf_removable(tb)) return CARRY_ON; /* * why do we perform this check here rather than earlier?? * Answer: we can win 1 node in some cases above. 
Moreover we * checked it above, when we checked, that S[0] is not removable * in principle */ /* new item fits into node S[h] without any shifting */ if (sfree >= levbytes) { if (!h) tb->s0num = vn->vn_nr_item; set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); return NO_BALANCING_NEEDED; } { int lpar, rpar, nset, lset, rset, lrset; /* regular overflowing of the node */ /* * get_num_ver works in 2 modes (FLOW & NO_FLOW) * lpar, rpar - number of items we can shift to left/right * neighbor (including splitting item) * nset, lset, rset, lrset - shows, whether flowing items * give better packing */ #define FLOW 1 #define NO_FLOW 0 /* do not any splitting */ /* we choose one of the following */ #define NOTHING_SHIFT_NO_FLOW 0 #define NOTHING_SHIFT_FLOW 5 #define LEFT_SHIFT_NO_FLOW 10 #define LEFT_SHIFT_FLOW 15 #define RIGHT_SHIFT_NO_FLOW 20 #define RIGHT_SHIFT_FLOW 25 #define LR_SHIFT_NO_FLOW 30 #define LR_SHIFT_FLOW 35 lpar = tb->lnum[h]; rpar = tb->rnum[h]; /* * calculate number of blocks S[h] must be split into when * nothing is shifted to the neighbors, as well as number of * items in each part of the split node (s012 numbers), * and number of bytes (s1bytes) of the shared drop which * flow to S1 if any */ nset = NOTHING_SHIFT_NO_FLOW; nver = get_num_ver(vn->vn_mode, tb, h, 0, -1, h ? vn->vn_nr_item : 0, -1, snum012, NO_FLOW); if (!h) { int nver1; /* * note, that in this case we try to bottle * between S[0] and S1 (S1 - the first new node) */ nver1 = get_num_ver(vn->vn_mode, tb, h, 0, -1, 0, -1, snum012 + NOTHING_SHIFT_FLOW, FLOW); if (nver > nver1) nset = NOTHING_SHIFT_FLOW, nver = nver1; } /* * calculate number of blocks S[h] must be split into when * l_shift_num first items and l_shift_bytes of the right * most liquid item to be shifted are shifted to the left * neighbor, as well as number of items in each part of the * splitted node (s012 numbers), and number of bytes * (s1bytes) of the shared drop which flow to S1 if any */ lset = LEFT_SHIFT_NO_FLOW; lnver = get_num_ver(vn->vn_mode, tb, h, lpar - ((h || tb->lbytes == -1) ? 0 : 1), -1, h ? vn->vn_nr_item : 0, -1, snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW); if (!h) { int lnver1; lnver1 = get_num_ver(vn->vn_mode, tb, h, lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, 0, -1, snum012 + LEFT_SHIFT_FLOW, FLOW); if (lnver > lnver1) lset = LEFT_SHIFT_FLOW, lnver = lnver1; } /* * calculate number of blocks S[h] must be split into when * r_shift_num first items and r_shift_bytes of the left most * liquid item to be shifted are shifted to the right neighbor, * as well as number of items in each part of the splitted * node (s012 numbers), and number of bytes (s1bytes) of the * shared drop which flow to S1 if any */ rset = RIGHT_SHIFT_NO_FLOW; rnver = get_num_ver(vn->vn_mode, tb, h, 0, -1, h ? (vn->vn_nr_item - rpar) : (rpar - ((tb-> rbytes != -1) ? 1 : 0)), -1, snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW); if (!h) { int rnver1; rnver1 = get_num_ver(vn->vn_mode, tb, h, 0, -1, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes, snum012 + RIGHT_SHIFT_FLOW, FLOW); if (rnver > rnver1) rset = RIGHT_SHIFT_FLOW, rnver = rnver1; } /* * calculate number of blocks S[h] must be split into when * items are shifted in both directions, as well as number * of items in each part of the splitted node (s012 numbers), * and number of bytes (s1bytes) of the shared drop which * flow to S1 if any */ lrset = LR_SHIFT_NO_FLOW; lrnver = get_num_ver(vn->vn_mode, tb, h, lpar - ((h || tb->lbytes == -1) ? 0 : 1), -1, h ? (vn->vn_nr_item - rpar) : (rpar - ((tb-> rbytes != -1) ? 
1 : 0)), -1, snum012 + LR_SHIFT_NO_FLOW, NO_FLOW); if (!h) { int lrnver1; lrnver1 = get_num_ver(vn->vn_mode, tb, h, lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes, snum012 + LR_SHIFT_FLOW, FLOW); if (lrnver > lrnver1) lrset = LR_SHIFT_FLOW, lrnver = lrnver1; } /* * Our general shifting strategy is: * 1) to minimized number of new nodes; * 2) to minimized number of neighbors involved in shifting; * 3) to minimized number of disk reads; */ /* we can win TWO or ONE nodes by shifting in both directions */ if (lrnver < lnver && lrnver < rnver) { RFALSE(h && (tb->lnum[h] != 1 || tb->rnum[h] != 1 || lrnver != 1 || rnver != 2 || lnver != 2 || h != 1), "vs-8230: bad h"); if (lrset == LR_SHIFT_FLOW) set_parameters(tb, h, tb->lnum[h], tb->rnum[h], lrnver, snum012 + lrset, tb->lbytes, tb->rbytes); else set_parameters(tb, h, tb->lnum[h] - ((tb->lbytes == -1) ? 0 : 1), tb->rnum[h] - ((tb->rbytes == -1) ? 0 : 1), lrnver, snum012 + lrset, -1, -1); return CARRY_ON; } /* * if shifting doesn't lead to better packing * then don't shift */ if (nver == lrnver) { set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1, -1); return CARRY_ON; } /* * now we know that for better packing shifting in only one * direction either to the left or to the right is required */ /* * if shifting to the left is better than * shifting to the right */ if (lnver < rnver) { SET_PAR_SHIFT_LEFT; return CARRY_ON; } /* * if shifting to the right is better than * shifting to the left */ if (lnver > rnver) { SET_PAR_SHIFT_RIGHT; return CARRY_ON; } /* * now shifting in either direction gives the same number * of nodes and we can make use of the cached neighbors */ if (is_left_neighbor_in_cache(tb, h)) { SET_PAR_SHIFT_LEFT; return CARRY_ON; } /* * shift to the right independently on whether the * right neighbor in cache or not */ SET_PAR_SHIFT_RIGHT; return CARRY_ON; } } /* * Check whether current node S[h] is balanced when Decreasing its size by * Deleting or Cutting for INTERNAL node of S+tree. * Calculate parameters for balancing for current level h. * Parameters: * tb tree_balance structure; * h current level of the node; * inum item number in S[h]; * mode i - insert, p - paste; * Returns: 1 - schedule occurred; * 0 - balancing for higher levels needed; * -1 - no balancing for higher levels needed; * -2 - no disk space. * * Note: Items of internal nodes have fixed size, so the balance condition for * the internal part of S+tree is as for the B-trees. */ static int dc_check_balance_internal(struct tree_balance *tb, int h) { struct virtual_node *vn = tb->tb_vn; /* * Sh is the node whose balance is currently being checked, * and Fh is its father. */ struct buffer_head *Sh, *Fh; int maxsize, ret; int lfree, rfree /* free space in L and R */ ; Sh = PATH_H_PBUFFER(tb->tb_path, h); Fh = PATH_H_PPARENT(tb->tb_path, h); maxsize = MAX_CHILD_SIZE(Sh); /* * using tb->insert_size[h], which is negative in this case, * create_virtual_node calculates: * new_nr_item = number of items node would have if operation is * performed without balancing (new_nr_item); */ create_virtual_node(tb, h); if (!Fh) { /* S[h] is the root. */ /* no balancing for higher levels needed */ if (vn->vn_nr_item > 0) { set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); return NO_BALANCING_NEEDED; } /* * new_nr_item == 0. * Current root will be deleted resulting in * decrementing the tree height. 
*/ set_parameters(tb, h, 0, 0, 0, NULL, -1, -1); return CARRY_ON; } if ((ret = get_parents(tb, h)) != CARRY_ON) return ret; /* get free space of neighbors */ rfree = get_rfree(tb, h); lfree = get_lfree(tb, h); /* determine maximal number of items we can fit into neighbors */ check_left(tb, h, lfree); check_right(tb, h, rfree); /* * Balance condition for the internal node is valid. * In this case we balance only if it leads to better packing. */ if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) { /* * Here we join S[h] with one of its neighbors, * which is impossible with greater values of new_nr_item. */ if (vn->vn_nr_item == MIN_NR_KEY(Sh)) { /* All contents of S[h] can be moved to L[h]. */ if (tb->lnum[h] >= vn->vn_nr_item + 1) { int n; int order_L; order_L = ((n = PATH_H_B_ITEM_ORDER(tb->tb_path, h)) == 0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE + KEY_SIZE); set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1); return CARRY_ON; } /* All contents of S[h] can be moved to R[h]. */ if (tb->rnum[h] >= vn->vn_nr_item + 1) { int n; int order_R; order_R = ((n = PATH_H_B_ITEM_ORDER(tb->tb_path, h)) == B_NR_ITEMS(Fh)) ? 0 : n + 1; n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE + KEY_SIZE); set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1); return CARRY_ON; } } /* * All contents of S[h] can be moved to the neighbors * (L[h] & R[h]). */ if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) { int to_r; to_r = ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1); return CARRY_ON; } /* Balancing does not lead to better packing. */ set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); return NO_BALANCING_NEEDED; } /* * Current node contain insufficient number of items. * Balancing is required. */ /* Check whether we can merge S[h] with left neighbor. */ if (tb->lnum[h] >= vn->vn_nr_item + 1) if (is_left_neighbor_in_cache(tb, h) || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) { int n; int order_L; order_L = ((n = PATH_H_B_ITEM_ORDER(tb->tb_path, h)) == 0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE + KEY_SIZE); set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1); return CARRY_ON; } /* Check whether we can merge S[h] with right neighbor. */ if (tb->rnum[h] >= vn->vn_nr_item + 1) { int n; int order_R; order_R = ((n = PATH_H_B_ITEM_ORDER(tb->tb_path, h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1); n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE + KEY_SIZE); set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1); return CARRY_ON; } /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). 
*/ if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) { int to_r; to_r = ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1); return CARRY_ON; } /* For internal nodes try to borrow item from a neighbor */ RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root"); /* Borrow one or two items from caching neighbor */ if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) { int from_l; from_l = (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item + 1) / 2 - (vn->vn_nr_item + 1); set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1); return CARRY_ON; } set_parameters(tb, h, 0, -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item + 1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1); return CARRY_ON; } /* * Check whether current node S[h] is balanced when Decreasing its size by * Deleting or Truncating for LEAF node of S+tree. * Calculate parameters for balancing for current level h. * Parameters: * tb tree_balance structure; * h current level of the node; * inum item number in S[h]; * mode i - insert, p - paste; * Returns: 1 - schedule occurred; * 0 - balancing for higher levels needed; * -1 - no balancing for higher levels needed; * -2 - no disk space. */ static int dc_check_balance_leaf(struct tree_balance *tb, int h) { struct virtual_node *vn = tb->tb_vn; /* * Number of bytes that must be deleted from * (value is negative if bytes are deleted) buffer which * contains node being balanced. The mnemonic is that the * attempted change in node space used level is levbytes bytes. */ int levbytes; /* the maximal item size */ int maxsize, ret; /* * S0 is the node whose balance is currently being checked, * and F0 is its father. */ struct buffer_head *S0, *F0; int lfree, rfree /* free space in L and R */ ; S0 = PATH_H_PBUFFER(tb->tb_path, 0); F0 = PATH_H_PPARENT(tb->tb_path, 0); levbytes = tb->insert_size[h]; maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */ if (!F0) { /* S[0] is the root now. */ RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0), "vs-8240: attempt to create empty buffer tree"); set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); return NO_BALANCING_NEEDED; } if ((ret = get_parents(tb, h)) != CARRY_ON) return ret; /* get free space of neighbors */ rfree = get_rfree(tb, h); lfree = get_lfree(tb, h); create_virtual_node(tb, h); /* if 3 leaves can be merge to one, set parameters and return */ if (are_leaves_removable(tb, lfree, rfree)) return CARRY_ON; /* * determine maximal number of items we can shift to the left/right * neighbor and the maximal number of bytes that can flow to the * left/right neighbor from the left/right most liquid item that * cannot be shifted from S[0] entirely */ check_left(tb, h, lfree); check_right(tb, h, rfree); /* check whether we can merge S with left neighbor. */ if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1) if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */ !tb->FR[h]) { RFALSE(!tb->FL[h], "vs-8245: dc_check_balance_leaf: FL[h] must exist"); /* set parameter to merge S[0] with its left neighbor */ set_parameters(tb, h, -1, 0, 0, NULL, -1, -1); return CARRY_ON; } /* check whether we can merge S[0] with right neighbor. */ if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) { set_parameters(tb, h, 0, -1, 0, NULL, -1, -1); return CARRY_ON; } /* * All contents of S[0] can be moved to the neighbors (L[0] & R[0]). 
* Set parameters and return */ if (is_leaf_removable(tb)) return CARRY_ON; /* Balancing is not required. */ tb->s0num = vn->vn_nr_item; set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); return NO_BALANCING_NEEDED; } /* * Check whether current node S[h] is balanced when Decreasing its size by * Deleting or Cutting. * Calculate parameters for balancing for current level h. * Parameters: * tb tree_balance structure; * h current level of the node; * inum item number in S[h]; * mode d - delete, c - cut. * Returns: 1 - schedule occurred; * 0 - balancing for higher levels needed; * -1 - no balancing for higher levels needed; * -2 - no disk space. */ static int dc_check_balance(struct tree_balance *tb, int h) { RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)), "vs-8250: S is not initialized"); if (h) return dc_check_balance_internal(tb, h); else return dc_check_balance_leaf(tb, h); } /* * Check whether current node S[h] is balanced. * Calculate parameters for balancing for current level h. * Parameters: * * tb tree_balance structure: * * tb is a large structure that must be read about in the header * file at the same time as this procedure if the reader is * to successfully understand this procedure * * h current level of the node; * inum item number in S[h]; * mode i - insert, p - paste, d - delete, c - cut. * Returns: 1 - schedule occurred; * 0 - balancing for higher levels needed; * -1 - no balancing for higher levels needed; * -2 - no disk space. */ static int check_balance(int mode, struct tree_balance *tb, int h, int inum, int pos_in_item, struct item_head *ins_ih, const void *data) { struct virtual_node *vn; vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf); vn->vn_free_ptr = (char *)(tb->tb_vn + 1); vn->vn_mode = mode; vn->vn_affected_item_num = inum; vn->vn_pos_in_item = pos_in_item; vn->vn_ins_ih = ins_ih; vn->vn_data = data; RFALSE(mode == M_INSERT && !vn->vn_ins_ih, "vs-8255: ins_ih can not be 0 in insert mode"); /* Calculate balance parameters when size of node is increasing. */ if (tb->insert_size[h] > 0) return ip_check_balance(tb, h); /* Calculate balance parameters when size of node is decreasing. */ return dc_check_balance(tb, h); } /* Check whether parent at the path is the really parent of the current node.*/ static int get_direct_parent(struct tree_balance *tb, int h) { struct buffer_head *bh; struct treepath *path = tb->tb_path; int position, path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h); /* We are in the root or in the new root. */ if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) { RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1, "PAP-8260: invalid offset in the path"); if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)-> b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) { /* Root is not changed. */ PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL; PATH_OFFSET_POSITION(path, path_offset - 1) = 0; return CARRY_ON; } /* Root is changed and we must recalculate the path. */ return REPEAT_SEARCH; } /* Parent in the path is not in the tree. */ if (!B_IS_IN_TREE (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1))) return REPEAT_SEARCH; if ((position = PATH_OFFSET_POSITION(path, path_offset - 1)) > B_NR_ITEMS(bh)) return REPEAT_SEARCH; /* Parent in the path is not parent of the current node in the tree. 
*/ if (B_N_CHILD_NUM(bh, position) != PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr) return REPEAT_SEARCH; if (buffer_locked(bh)) { int depth = reiserfs_write_unlock_nested(tb->tb_sb); __wait_on_buffer(bh); reiserfs_write_lock_nested(tb->tb_sb, depth); if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; } /* * Parent in the path is unlocked and really parent * of the current node. */ return CARRY_ON; } /* * Using lnum[h] and rnum[h] we should determine what neighbors * of S[h] we * need in order to balance S[h], and get them if necessary. * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; * CARRY_ON - schedule didn't occur while the function worked; */ static int get_neighbors(struct tree_balance *tb, int h) { int child_position, path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1); unsigned long son_number; struct super_block *sb = tb->tb_sb; struct buffer_head *bh; int depth; PROC_INFO_INC(sb, get_neighbors[h]); if (tb->lnum[h]) { /* We need left neighbor to balance S[h]. */ PROC_INFO_INC(sb, need_l_neighbor[h]); bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset); RFALSE(bh == tb->FL[h] && !PATH_OFFSET_POSITION(tb->tb_path, path_offset), "PAP-8270: invalid position in the parent"); child_position = (bh == tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb-> FL[h]); son_number = B_N_CHILD_NUM(tb->FL[h], child_position); depth = reiserfs_write_unlock_nested(tb->tb_sb); bh = sb_bread(sb, son_number); reiserfs_write_lock_nested(tb->tb_sb, depth); if (!bh) return IO_ERROR; if (FILESYSTEM_CHANGED_TB(tb)) { brelse(bh); PROC_INFO_INC(sb, get_neighbors_restart[h]); return REPEAT_SEARCH; } RFALSE(!B_IS_IN_TREE(tb->FL[h]) || child_position > B_NR_ITEMS(tb->FL[h]) || B_N_CHILD_NUM(tb->FL[h], child_position) != bh->b_blocknr, "PAP-8275: invalid parent"); RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child"); RFALSE(!h && B_FREE_SPACE(bh) != MAX_CHILD_SIZE(bh) - dc_size(B_N_CHILD(tb->FL[0], child_position)), "PAP-8290: invalid child size of left neighbor"); brelse(tb->L[h]); tb->L[h] = bh; } /* We need right neighbor to balance S[path_offset]. */ if (tb->rnum[h]) { PROC_INFO_INC(sb, need_r_neighbor[h]); bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset); RFALSE(bh == tb->FR[h] && PATH_OFFSET_POSITION(tb->tb_path, path_offset) >= B_NR_ITEMS(bh), "PAP-8295: invalid position in the parent"); child_position = (bh == tb->FR[h]) ? 
tb->rkey[h] + 1 : 0; son_number = B_N_CHILD_NUM(tb->FR[h], child_position); depth = reiserfs_write_unlock_nested(tb->tb_sb); bh = sb_bread(sb, son_number); reiserfs_write_lock_nested(tb->tb_sb, depth); if (!bh) return IO_ERROR; if (FILESYSTEM_CHANGED_TB(tb)) { brelse(bh); PROC_INFO_INC(sb, get_neighbors_restart[h]); return REPEAT_SEARCH; } brelse(tb->R[h]); tb->R[h] = bh; RFALSE(!h && B_FREE_SPACE(bh) != MAX_CHILD_SIZE(bh) - dc_size(B_N_CHILD(tb->FR[0], child_position)), "PAP-8300: invalid child size of right neighbor (%d != %d - %d)", B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh), dc_size(B_N_CHILD(tb->FR[0], child_position))); } return CARRY_ON; } static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh) { int max_num_of_items; int max_num_of_entries; unsigned long blocksize = sb->s_blocksize; #define MIN_NAME_LEN 1 max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN); max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) / (DEH_SIZE + MIN_NAME_LEN); return sizeof(struct virtual_node) + max(max_num_of_items * sizeof(struct virtual_item), sizeof(struct virtual_item) + sizeof(struct direntry_uarea) + (max_num_of_entries - 1) * sizeof(__u16)); } /* * maybe we should fail balancing we are going to perform when kmalloc * fails several times. But now it will loop until kmalloc gets * required memory */ static int get_mem_for_virtual_node(struct tree_balance *tb) { int check_fs = 0; int size; char *buf; size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path)); /* we have to allocate more memory for virtual node */ if (size > tb->vn_buf_size) { if (tb->vn_buf) { /* free memory allocated before */ kfree(tb->vn_buf); /* this is not needed if kfree is atomic */ check_fs = 1; } /* virtual node requires now more memory */ tb->vn_buf_size = size; /* get memory for virtual item */ buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); if (!buf) { /* * getting memory with GFP_KERNEL priority may involve * balancing now (due to indirect_to_direct conversion * on dcache shrinking). 
So, release path and collected * resources here */ free_buffers_in_tb(tb); buf = kmalloc(size, GFP_NOFS); if (!buf) { tb->vn_buf_size = 0; } tb->vn_buf = buf; schedule(); return REPEAT_SEARCH; } tb->vn_buf = buf; } if (check_fs && FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; return CARRY_ON; } #ifdef CONFIG_REISERFS_CHECK static void tb_buffer_sanity_check(struct super_block *sb, struct buffer_head *bh, const char *descr, int level) { if (bh) { if (atomic_read(&(bh->b_count)) <= 0) reiserfs_panic(sb, "jmacd-1", "negative or zero " "reference counter for buffer %s[%d] " "(%b)", descr, level, bh); if (!buffer_uptodate(bh)) reiserfs_panic(sb, "jmacd-2", "buffer is not up " "to date %s[%d] (%b)", descr, level, bh); if (!B_IS_IN_TREE(bh)) reiserfs_panic(sb, "jmacd-3", "buffer is not " "in tree %s[%d] (%b)", descr, level, bh); if (bh->b_bdev != sb->s_bdev) reiserfs_panic(sb, "jmacd-4", "buffer has wrong " "device %s[%d] (%b)", descr, level, bh); if (bh->b_size != sb->s_blocksize) reiserfs_panic(sb, "jmacd-5", "buffer has wrong " "blocksize %s[%d] (%b)", descr, level, bh); if (bh->b_blocknr > SB_BLOCK_COUNT(sb)) reiserfs_panic(sb, "jmacd-6", "buffer block " "number too high %s[%d] (%b)", descr, level, bh); } } #else static void tb_buffer_sanity_check(struct super_block *sb, struct buffer_head *bh, const char *descr, int level) {; } #endif static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh) { return reiserfs_prepare_for_journal(s, bh, 0); } static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) { struct buffer_head *locked; #ifdef CONFIG_REISERFS_CHECK int repeat_counter = 0; #endif int i; do { locked = NULL; for (i = tb->tb_path->path_length; !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) { if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) { /* * if I understand correctly, we can only * be sure the last buffer in the path is * in the tree --clm */ #ifdef CONFIG_REISERFS_CHECK if (PATH_PLAST_BUFFER(tb->tb_path) == PATH_OFFSET_PBUFFER(tb->tb_path, i)) tb_buffer_sanity_check(tb->tb_sb, PATH_OFFSET_PBUFFER (tb->tb_path, i), "S", tb->tb_path-> path_length - i); #endif if (!clear_all_dirty_bits(tb->tb_sb, PATH_OFFSET_PBUFFER (tb->tb_path, i))) { locked = PATH_OFFSET_PBUFFER(tb->tb_path, i); } } } for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i]; i++) { if (tb->lnum[i]) { if (tb->L[i]) { tb_buffer_sanity_check(tb->tb_sb, tb->L[i], "L", i); if (!clear_all_dirty_bits (tb->tb_sb, tb->L[i])) locked = tb->L[i]; } if (!locked && tb->FL[i]) { tb_buffer_sanity_check(tb->tb_sb, tb->FL[i], "FL", i); if (!clear_all_dirty_bits (tb->tb_sb, tb->FL[i])) locked = tb->FL[i]; } if (!locked && tb->CFL[i]) { tb_buffer_sanity_check(tb->tb_sb, tb->CFL[i], "CFL", i); if (!clear_all_dirty_bits (tb->tb_sb, tb->CFL[i])) locked = tb->CFL[i]; } } if (!locked && (tb->rnum[i])) { if (tb->R[i]) { tb_buffer_sanity_check(tb->tb_sb, tb->R[i], "R", i); if (!clear_all_dirty_bits (tb->tb_sb, tb->R[i])) locked = tb->R[i]; } if (!locked && tb->FR[i]) { tb_buffer_sanity_check(tb->tb_sb, tb->FR[i], "FR", i); if (!clear_all_dirty_bits (tb->tb_sb, tb->FR[i])) locked = tb->FR[i]; } if (!locked && tb->CFR[i]) { tb_buffer_sanity_check(tb->tb_sb, tb->CFR[i], "CFR", i); if (!clear_all_dirty_bits (tb->tb_sb, tb->CFR[i])) locked = tb->CFR[i]; } } } /* * as far as I can tell, this is not required. The FEB list * seems to be full of newly allocated nodes, which will * never be locked, dirty, or anything else. * To be safe, I'm putting in the checks and waits in. 
* For the moment, they are needed to keep the code in * journal.c from complaining about the buffer. * That code is inside CONFIG_REISERFS_CHECK as well. --clm */ for (i = 0; !locked && i < MAX_FEB_SIZE; i++) { if (tb->FEB[i]) { if (!clear_all_dirty_bits (tb->tb_sb, tb->FEB[i])) locked = tb->FEB[i]; } } if (locked) { int depth; #ifdef CONFIG_REISERFS_CHECK repeat_counter++; if ((repeat_counter % 10000) == 0) { reiserfs_warning(tb->tb_sb, "reiserfs-8200", "too many iterations waiting " "for buffer to unlock " "(%b)", locked); /* Don't loop forever. Try to recover from possible error. */ return (FILESYSTEM_CHANGED_TB(tb)) ? REPEAT_SEARCH : CARRY_ON; } #endif depth = reiserfs_write_unlock_nested(tb->tb_sb); __wait_on_buffer(locked); reiserfs_write_lock_nested(tb->tb_sb, depth); if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; } } while (locked); return CARRY_ON; } /* * Prepare for balancing, that is * get all necessary parents, and neighbors; * analyze what and where should be moved; * get sufficient number of new nodes; * Balancing will start only after all resources will be collected at a time. * * When ported to SMP kernels, only at the last moment after all needed nodes * are collected in cache, will the resources be locked using the usual * textbook ordered lock acquisition algorithms. Note that ensuring that * this code neither write locks what it does not need to write lock nor locks * out of order will be a pain in the butt that could have been avoided. * Grumble grumble. -Hans * * fix is meant in the sense of render unchanging * * Latency might be improved by first gathering a list of what buffers * are needed and then getting as many of them in parallel as possible? -Hans * * Parameters: * op_mode i - insert, d - delete, c - cut (truncate), p - paste (append) * tb tree_balance structure; * inum item number in S[h]; * pos_in_item - comment this if you can * ins_ih item head of item being inserted * data inserted item or data to be pasted * Returns: 1 - schedule occurred while the function worked; * 0 - schedule didn't occur while the function worked; * -1 - if no_disk_space */ int fix_nodes(int op_mode, struct tree_balance *tb, struct item_head *ins_ih, const void *data) { int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path); int pos_in_item; /* * we set wait_tb_buffers_run when we have to restore any dirty * bits cleared during wait_tb_buffers_run */ int wait_tb_buffers_run = 0; struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); ++REISERFS_SB(tb->tb_sb)->s_fix_nodes; pos_in_item = tb->tb_path->pos_in_item; tb->fs_gen = get_generation(tb->tb_sb); /* * we prepare and log the super here so it will already be in the * transaction when do_balance needs to change it. 
* This way do_balance won't have to schedule when trying to prepare * the super for logging */ reiserfs_prepare_for_journal(tb->tb_sb, SB_BUFFER_WITH_SB(tb->tb_sb), 1); journal_mark_dirty(tb->transaction_handle, SB_BUFFER_WITH_SB(tb->tb_sb)); if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; /* if it possible in indirect_to_direct conversion */ if (buffer_locked(tbS0)) { int depth = reiserfs_write_unlock_nested(tb->tb_sb); __wait_on_buffer(tbS0); reiserfs_write_lock_nested(tb->tb_sb, depth); if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; } #ifdef CONFIG_REISERFS_CHECK if (REISERFS_SB(tb->tb_sb)->cur_tb) { print_cur_tb("fix_nodes"); reiserfs_panic(tb->tb_sb, "PAP-8305", "there is pending do_balance"); } if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0)) reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is " "not uptodate at the beginning of fix_nodes " "or not in tree (mode %c)", tbS0, tbS0, op_mode); /* Check parameters. */ switch (op_mode) { case M_INSERT: if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0)) reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect " "item number %d (in S0 - %d) in case " "of insert", item_num, B_NR_ITEMS(tbS0)); break; case M_PASTE: case M_DELETE: case M_CUT: if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) { print_block(tbS0, 0, -1, -1); reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect " "item number(%d); mode = %c " "insert_size = %d", item_num, op_mode, tb->insert_size[0]); } break; default: reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode " "of operation"); } #endif if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH) /* FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat */ return REPEAT_SEARCH; /* Starting from the leaf level; for all levels h of the tree. */ for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) { ret = get_direct_parent(tb, h); if (ret != CARRY_ON) goto repeat; ret = check_balance(op_mode, tb, h, item_num, pos_in_item, ins_ih, data); if (ret != CARRY_ON) { if (ret == NO_BALANCING_NEEDED) { /* No balancing for higher levels needed. */ ret = get_neighbors(tb, h); if (ret != CARRY_ON) goto repeat; if (h != MAX_HEIGHT - 1) tb->insert_size[h + 1] = 0; /* * ok, analysis and resource gathering * are complete */ break; } goto repeat; } ret = get_neighbors(tb, h); if (ret != CARRY_ON) goto repeat; /* * No disk space, or schedule occurred and analysis may be * invalid and needs to be redone. */ ret = get_empty_nodes(tb, h); if (ret != CARRY_ON) goto repeat; /* * We have a positive insert size but no nodes exist on this * level, this means that we are creating a new root. */ if (!PATH_H_PBUFFER(tb->tb_path, h)) { RFALSE(tb->blknum[h] != 1, "PAP-8350: creating new empty root"); if (h < MAX_HEIGHT - 1) tb->insert_size[h + 1] = 0; } else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) { /* * The tree needs to be grown, so this node S[h] * which is the root node is split into two nodes, * and a new node (S[h+1]) will be created to * become the root node. 
*/ if (tb->blknum[h] > 1) { RFALSE(h == MAX_HEIGHT - 1, "PAP-8355: attempt to create too high of a tree"); tb->insert_size[h + 1] = (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1) + DC_SIZE; } else if (h < MAX_HEIGHT - 1) tb->insert_size[h + 1] = 0; } else tb->insert_size[h + 1] = (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1); } ret = wait_tb_buffers_until_unlocked(tb); if (ret == CARRY_ON) { if (FILESYSTEM_CHANGED_TB(tb)) { wait_tb_buffers_run = 1; ret = REPEAT_SEARCH; goto repeat; } else { return CARRY_ON; } } else { wait_tb_buffers_run = 1; goto repeat; } repeat: /* * fix_nodes was unable to perform its calculation due to * filesystem got changed under us, lack of free disk space or i/o * failure. If the first is the case - the search will be * repeated. For now - free all resources acquired so far except * for the new allocated nodes */ { int i; /* Release path buffers. */ if (wait_tb_buffers_run) { pathrelse_and_restore(tb->tb_sb, tb->tb_path); } else { pathrelse(tb->tb_path); } /* brelse all resources collected for balancing */ for (i = 0; i < MAX_HEIGHT; i++) { if (wait_tb_buffers_run) { reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]); reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]); reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]); reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]); reiserfs_restore_prepared_buffer(tb->tb_sb, tb-> CFL[i]); reiserfs_restore_prepared_buffer(tb->tb_sb, tb-> CFR[i]); } brelse(tb->L[i]); brelse(tb->R[i]); brelse(tb->FL[i]); brelse(tb->FR[i]); brelse(tb->CFL[i]); brelse(tb->CFR[i]); tb->L[i] = NULL; tb->R[i] = NULL; tb->FL[i] = NULL; tb->FR[i] = NULL; tb->CFL[i] = NULL; tb->CFR[i] = NULL; } if (wait_tb_buffers_run) { for (i = 0; i < MAX_FEB_SIZE; i++) { if (tb->FEB[i]) reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FEB[i]); } } return ret; } } void unfix_nodes(struct tree_balance *tb) { int i; /* Release path buffers. */ pathrelse_and_restore(tb->tb_sb, tb->tb_path); /* brelse all resources collected for balancing */ for (i = 0; i < MAX_HEIGHT; i++) { reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]); reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]); reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]); reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]); reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]); reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]); brelse(tb->L[i]); brelse(tb->R[i]); brelse(tb->FL[i]); brelse(tb->FR[i]); brelse(tb->CFL[i]); brelse(tb->CFR[i]); } /* deal with list of allocated (used and unused) nodes */ for (i = 0; i < MAX_FEB_SIZE; i++) { if (tb->FEB[i]) { b_blocknr_t blocknr = tb->FEB[i]->b_blocknr; /* * de-allocated block which was not used by * balancing and bforget about buffer for it */ brelse(tb->FEB[i]); reiserfs_free_block(tb->transaction_handle, NULL, blocknr, 0); } if (tb->used[i]) { /* release used as new nodes including a new root */ brelse(tb->used[i]); } } kfree(tb->vn_buf); } |
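/*
 * Illustrative sketch (not part of the original file): how a caller is
 * expected to drive fix_nodes().  The real callers live in
 * fs/reiserfs/stree.c; the helper below is hypothetical and only shows
 * the retry protocol implied by the return codes above: REPEAT_SEARCH
 * means "re-do the tree search and try again", CARRY_ON means "all
 * resources are gathered, balancing may run", and NO_DISK_SPACE or
 * IO_ERROR abort the operation after unfix_nodes().
 */
static int example_balance_loop(struct tree_balance *tb, int op_mode,
				struct item_head *ih, const char *body)
{
	int ret;

	do {
		/* Gather parents, neighbors and new nodes for balancing. */
		ret = fix_nodes(op_mode, tb, ih, body);
		if (ret == REPEAT_SEARCH) {
			/*
			 * The filesystem changed under us or buffers were
			 * busy; a real caller re-runs search_by_key() here
			 * to refresh tb->tb_path before retrying.
			 */
			continue;
		}
		break;
	} while (1);

	if (ret != CARRY_ON) {
		/* NO_DISK_SPACE or IO_ERROR: release whatever was pinned. */
		unfix_nodes(tb);
		return ret;
	}

	/*
	 * With CARRY_ON every buffer and every new node needed for the
	 * rebalancing is already held, so do_balance() (defined in
	 * fs/reiserfs/do_balan.c) can run without blocking; it consumes
	 * the tree_balance, so unfix_nodes() is not called on this path.
	 */
	do_balance(tb, ih, body, op_mode);
	return 0;
}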
/*
 * Access kernel memory without faulting.
 */
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/uaccess.h>

static __always_inline long probe_read_common(void *dst,
		const void __user *src, size_t size)
{
	long ret;

	pagefault_disable();
	ret = __copy_from_user_inatomic(dst, src, size);
	pagefault_enable();

	return ret ? -EFAULT : 0;
}

static __always_inline long probe_write_common(void __user *dst,
		const void *src, size_t size)
{
	long ret;

	pagefault_disable();
	ret = __copy_to_user_inatomic(dst, src, size);
	pagefault_enable();

	return ret ? -EFAULT : 0;
}

/**
 * probe_kernel_read(): safely attempt to read from a kernel-space location
 * @dst: pointer to the buffer that shall take the data
 * @src: address to read from
 * @size: size of the data chunk
 *
 * Safely read from address @src to the buffer at @dst.  If a kernel fault
 * happens, handle that and return -EFAULT.
 *
 * We ensure that the copy_from_user is executed in atomic context so that
 * do_page_fault() doesn't attempt to take mmap_sem.  This makes
 * probe_kernel_read() suitable for use within regions where the caller
 * already holds mmap_sem, or other locks which nest inside mmap_sem.
 */

long __weak probe_kernel_read(void *dst, const void *src, size_t size)
    __attribute__((alias("__probe_kernel_read")));

long __probe_kernel_read(void *dst, const void *src, size_t size)
{
	long ret;
	mm_segment_t old_fs = get_fs();

	set_fs(KERNEL_DS);
	ret = probe_read_common(dst, (__force const void __user *)src, size);
	set_fs(old_fs);

	return ret;
}
EXPORT_SYMBOL_GPL(probe_kernel_read);

/**
 * probe_user_read(): safely attempt to read from a user-space location
 * @dst: pointer to the buffer that shall take the data
 * @src: address to read from. This must be a user address.
 * @size: size of the data chunk
 *
 * Safely read from user address @src to the buffer at @dst. If a kernel fault
 * happens, handle that and return -EFAULT.
 */

long __weak probe_user_read(void *dst, const void __user *src, size_t size)
    __attribute__((alias("__probe_user_read")));

long __probe_user_read(void *dst, const void __user *src, size_t size)
{
	long ret = -EFAULT;
	mm_segment_t old_fs = get_fs();

	set_fs(USER_DS);
	if (access_ok(VERIFY_READ, src, size))
		ret = probe_read_common(dst, src, size);
	set_fs(old_fs);

	return ret;
}
EXPORT_SYMBOL_GPL(probe_user_read);

/**
 * probe_kernel_write(): safely attempt to write to a location
 * @dst: address to write to
 * @src: pointer to the data that shall be written
 * @size: size of the data chunk
 *
 * Safely write to address @dst from the buffer at @src.  If a kernel fault
 * happens, handle that and return -EFAULT.
*/ long __weak probe_kernel_write(void *dst, const void *src, size_t size) __attribute__((alias("__probe_kernel_write"))); long __probe_kernel_write(void *dst, const void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); set_fs(KERNEL_DS); ret = probe_write_common((__force void __user *)dst, src, size); set_fs(old_fs); return ret; } EXPORT_SYMBOL_GPL(probe_kernel_write); /** * probe_user_write(): safely attempt to write to a user-space location * @dst: address to write to * @src: pointer to the data that shall be written * @size: size of the data chunk * * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ long __weak probe_user_write(void __user *dst, const void *src, size_t size) __attribute__((alias("__probe_user_write"))); long __probe_user_write(void __user *dst, const void *src, size_t size) { long ret = -EFAULT; mm_segment_t old_fs = get_fs(); set_fs(USER_DS); if (access_ok(VERIFY_WRITE, dst, size)) ret = probe_write_common(dst, src, size); set_fs(old_fs); return ret; } EXPORT_SYMBOL_GPL(probe_user_write); /** * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address. * @dst: Destination address, in kernel space. This buffer must be at * least @count bytes long. * @unsafe_addr: Unsafe address. * @count: Maximum number of bytes to copy, including the trailing NUL. * * Copies a NUL-terminated string from unsafe address to kernel buffer. * * On success, returns the length of the string INCLUDING the trailing NUL. * * If access fails, returns -EFAULT (some data may have been copied * and the trailing NUL added). * * If @count is smaller than the length of the string, copies @count-1 bytes, * sets the last byte of @dst buffer to NUL and returns @count. */ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) { mm_segment_t old_fs = get_fs(); const void *src = unsafe_addr; long ret; if (unlikely(count <= 0)) return 0; set_fs(KERNEL_DS); pagefault_disable(); do { ret = __get_user(*dst++, (const char __user __force *)src++); } while (dst[-1] && ret == 0 && src - unsafe_addr < count); dst[-1] = '\0'; pagefault_enable(); set_fs(old_fs); return ret ? -EFAULT : src - unsafe_addr; } /** * strncpy_from_unsafe_user: - Copy a NUL terminated string from unsafe user * address. * @dst: Destination address, in kernel space. This buffer must be at * least @count bytes long. * @unsafe_addr: Unsafe user address. * @count: Maximum number of bytes to copy, including the trailing NUL. * * Copies a NUL-terminated string from unsafe user address to kernel buffer. * * On success, returns the length of the string INCLUDING the trailing NUL. * * If access fails, returns -EFAULT (some data may have been copied * and the trailing NUL added). * * If @count is smaller than the length of the string, copies @count-1 bytes, * sets the last byte of @dst buffer to NUL and returns @count. */ long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr, long count) { mm_segment_t old_fs = get_fs(); long ret; if (unlikely(count <= 0)) return 0; set_fs(USER_DS); pagefault_disable(); ret = strncpy_from_user(dst, unsafe_addr, count); pagefault_enable(); set_fs(old_fs); if (ret >= count) { ret = count; dst[ret - 1] = '\0'; } else if (ret > 0) { ret++; } return ret; } /** * strnlen_unsafe_user: - Get the size of a user string INCLUDING final NUL. * @unsafe_addr: The string to measure. * @count: Maximum count (including NUL) * * Get the size of a NUL-terminated string in user space without pagefault. 
 *
 * Returns the size of the string INCLUDING the terminating NUL.
 *
 * If the string is too long, returns a number larger than @count. User
 * has to check the return value against "> count".
 * On exception (or invalid count), returns 0.
 *
 * Unlike strnlen_user, this can be used from IRQ handler etc. because
 * it disables pagefaults.
 */
long strnlen_unsafe_user(const void __user *unsafe_addr, long count)
{
	mm_segment_t old_fs = get_fs();
	int ret;

	set_fs(USER_DS);
	pagefault_disable();
	ret = strnlen_user(unsafe_addr, count);
	pagefault_enable();
	set_fs(old_fs);

	return ret;
}
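/*
 * Illustrative sketch (not part of the original file): how the probing
 * helpers above are typically used from code that must not fault or sleep
 * (tracing, oops/crash paths, debugfs dumpers).  The two functions below
 * are hypothetical examples, not kernel interfaces; only
 * probe_kernel_read() and strncpy_from_unsafe() come from this file.
 */
static void example_peek_kernel_word(const void *addr)
{
	unsigned long val;

	/*
	 * probe_kernel_read() returns 0 on success and -EFAULT if @addr is
	 * not readable; it disables pagefaults instead of handling them, so
	 * it never takes mmap_sem and is safe in atomic context.
	 */
	if (probe_kernel_read(&val, addr, sizeof(val)))
		pr_info("%px: <unreadable>\n", addr);
	else
		pr_info("%px: %016lx\n", addr, val);
}

static void example_peek_kernel_string(const void *addr)
{
	char buf[64];
	long len;

	/* Copies at most sizeof(buf) bytes (NUL included), -EFAULT on fault. */
	len = strncpy_from_unsafe(buf, addr, sizeof(buf));
	if (len < 0)
		pr_info("%px: <unreadable string>\n", addr);
	else
		pr_info("%px: \"%s\" (%ld bytes)\n", addr, buf, len);
}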
/*
 *	fs/bfs/inode.c
 *	BFS superblock and inode operations.
 *	Copyright (C) 1999-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
 *	From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds.
 *
 *	Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
 */

#include <linux/module.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/vfs.h>
#include <linux/writeback.h>
#include <linux/uio.h>
#include <linux/uaccess.h>
#include "bfs.h"

MODULE_AUTHOR("Tigran Aivazian <aivazian.tigran@gmail.com>");
MODULE_DESCRIPTION("SCO UnixWare BFS filesystem for Linux");
MODULE_LICENSE("GPL");

#undef DEBUG

#ifdef DEBUG
#define dprintf(x...)	printf(x)
#else
#define dprintf(x...)
#endif struct inode *bfs_iget(struct super_block *sb, unsigned long ino) { struct bfs_inode *di; struct inode *inode; struct buffer_head *bh; int block, off; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) { printf("Bad inode number %s:%08lx\n", inode->i_sb->s_id, ino); goto error; } block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; bh = sb_bread(inode->i_sb, block); if (!bh) { printf("Unable to read inode %s:%08lx\n", inode->i_sb->s_id, ino); goto error; } off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; di = (struct bfs_inode *)bh->b_data + off; inode->i_mode = 0x0000FFFF & le32_to_cpu(di->i_mode); if (le32_to_cpu(di->i_vtype) == BFS_VDIR) { inode->i_mode |= S_IFDIR; inode->i_op = &bfs_dir_inops; inode->i_fop = &bfs_dir_operations; } else if (le32_to_cpu(di->i_vtype) == BFS_VREG) { inode->i_mode |= S_IFREG; inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; inode->i_mapping->a_ops = &bfs_aops; } BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock); BFS_I(inode)->i_eblock = le32_to_cpu(di->i_eblock); BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); i_uid_write(inode, le32_to_cpu(di->i_uid)); i_gid_write(inode, le32_to_cpu(di->i_gid)); set_nlink(inode, le32_to_cpu(di->i_nlink)); inode->i_size = BFS_FILESIZE(di); inode->i_blocks = BFS_FILEBLOCKS(di); inode->i_atime.tv_sec = le32_to_cpu(di->i_atime); inode->i_mtime.tv_sec = le32_to_cpu(di->i_mtime); inode->i_ctime.tv_sec = le32_to_cpu(di->i_ctime); inode->i_atime.tv_nsec = 0; inode->i_mtime.tv_nsec = 0; inode->i_ctime.tv_nsec = 0; brelse(bh); unlock_new_inode(inode); return inode; error: iget_failed(inode); return ERR_PTR(-EIO); } static struct bfs_inode *find_inode(struct super_block *sb, u16 ino, struct buffer_head **p) { if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(sb)->si_lasti)) { printf("Bad inode number %s:%08x\n", sb->s_id, ino); return ERR_PTR(-EIO); } ino -= BFS_ROOT_INO; *p = sb_bread(sb, 1 + ino / BFS_INODES_PER_BLOCK); if (!*p) { printf("Unable to read inode %s:%08x\n", sb->s_id, ino); return ERR_PTR(-EIO); } return (struct bfs_inode *)(*p)->b_data + ino % BFS_INODES_PER_BLOCK; } static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct bfs_sb_info *info = BFS_SB(inode->i_sb); unsigned int ino = (u16)inode->i_ino; unsigned long i_sblock; struct bfs_inode *di; struct buffer_head *bh; int err = 0; dprintf("ino=%08x\n", ino); di = find_inode(inode->i_sb, ino, &bh); if (IS_ERR(di)) return PTR_ERR(di); mutex_lock(&info->bfs_lock); if (ino == BFS_ROOT_INO) di->i_vtype = cpu_to_le32(BFS_VDIR); else di->i_vtype = cpu_to_le32(BFS_VREG); di->i_ino = cpu_to_le16(ino); di->i_mode = cpu_to_le32(inode->i_mode); di->i_uid = cpu_to_le32(i_uid_read(inode)); di->i_gid = cpu_to_le32(i_gid_read(inode)); di->i_nlink = cpu_to_le32(inode->i_nlink); di->i_atime = cpu_to_le32(inode->i_atime.tv_sec); di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); i_sblock = BFS_I(inode)->i_sblock; di->i_sblock = cpu_to_le32(i_sblock); di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock); di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); mark_buffer_dirty(bh); if (wbc->sync_mode == WB_SYNC_ALL) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) err = -EIO; } brelse(bh); mutex_unlock(&info->bfs_lock); return err; } static void bfs_evict_inode(struct inode *inode) { unsigned long ino = inode->i_ino; struct 
bfs_inode *di; struct buffer_head *bh; struct super_block *s = inode->i_sb; struct bfs_sb_info *info = BFS_SB(s); struct bfs_inode_info *bi = BFS_I(inode); dprintf("ino=%08lx\n", ino); truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); clear_inode(inode); if (inode->i_nlink) return; di = find_inode(s, inode->i_ino, &bh); if (IS_ERR(di)) return; mutex_lock(&info->bfs_lock); /* clear on-disk inode */ memset(di, 0, sizeof(struct bfs_inode)); mark_buffer_dirty(bh); brelse(bh); if (bi->i_dsk_ino) { if (bi->i_sblock) info->si_freeb += bi->i_eblock + 1 - bi->i_sblock; info->si_freei++; clear_bit(ino, info->si_imap); bfs_dump_imap("delete_inode", s); } /* * If this was the last file, make the previous block * "last block of the last file" even if there is no * real file there, saves us 1 gap. */ if (info->si_lf_eblk == bi->i_eblock) info->si_lf_eblk = bi->i_sblock - 1; mutex_unlock(&info->bfs_lock); } static void bfs_put_super(struct super_block *s) { struct bfs_sb_info *info = BFS_SB(s); if (!info) return; mutex_destroy(&info->bfs_lock); kfree(info->si_imap); kfree(info); s->s_fs_info = NULL; } static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *s = dentry->d_sb; struct bfs_sb_info *info = BFS_SB(s); u64 id = huge_encode_dev(s->s_bdev->bd_dev); buf->f_type = BFS_MAGIC; buf->f_bsize = s->s_blocksize; buf->f_blocks = info->si_blocks; buf->f_bfree = buf->f_bavail = info->si_freeb; buf->f_files = info->si_lasti + 1 - BFS_ROOT_INO; buf->f_ffree = info->si_freei; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); buf->f_namelen = BFS_NAMELEN; return 0; } static struct kmem_cache *bfs_inode_cachep; static struct inode *bfs_alloc_inode(struct super_block *sb) { struct bfs_inode_info *bi; bi = kmem_cache_alloc(bfs_inode_cachep, GFP_KERNEL); if (!bi) return NULL; return &bi->vfs_inode; } static void bfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); } static void bfs_destroy_inode(struct inode *inode) { call_rcu(&inode->i_rcu, bfs_i_callback); } static void init_once(void *foo) { struct bfs_inode_info *bi = foo; inode_init_once(&bi->vfs_inode); } static int __init init_inodecache(void) { bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", sizeof(struct bfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (bfs_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. 
*/ rcu_barrier(); kmem_cache_destroy(bfs_inode_cachep); } static const struct super_operations bfs_sops = { .alloc_inode = bfs_alloc_inode, .destroy_inode = bfs_destroy_inode, .write_inode = bfs_write_inode, .evict_inode = bfs_evict_inode, .put_super = bfs_put_super, .statfs = bfs_statfs, }; void bfs_dump_imap(const char *prefix, struct super_block *s) { #ifdef DEBUG int i; char *tmpbuf = (char *)get_zeroed_page(GFP_KERNEL); if (!tmpbuf) return; for (i = BFS_SB(s)->si_lasti; i >= 0; i--) { if (i > PAGE_SIZE - 100) break; if (test_bit(i, BFS_SB(s)->si_imap)) strcat(tmpbuf, "1"); else strcat(tmpbuf, "0"); } printf("BFS-fs: %s: lasti=%08lx <%s>\n", prefix, BFS_SB(s)->si_lasti, tmpbuf); free_page((unsigned long)tmpbuf); #endif } static int bfs_fill_super(struct super_block *s, void *data, int silent) { struct buffer_head *bh, *sbh; struct bfs_super_block *bfs_sb; struct inode *inode; unsigned i, imap_len; struct bfs_sb_info *info; int ret = -EINVAL; unsigned long i_sblock, i_eblock, i_eoff, s_size; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; mutex_init(&info->bfs_lock); s->s_fs_info = info; sb_set_blocksize(s, BFS_BSIZE); sbh = sb_bread(s, 0); if (!sbh) goto out; bfs_sb = (struct bfs_super_block *)sbh->b_data; if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) { if (!silent) printf("No BFS filesystem on %s (magic=%08x)\n", s->s_id, le32_to_cpu(bfs_sb->s_magic)); goto out1; } if (BFS_UNCLEAN(bfs_sb, s) && !silent) printf("%s is unclean, continuing\n", s->s_id); s->s_magic = BFS_MAGIC; if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end) || le32_to_cpu(bfs_sb->s_start) < BFS_BSIZE) { printf("Superblock is corrupted\n"); goto out1; } info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / sizeof(struct bfs_inode) + BFS_ROOT_INO - 1; imap_len = (info->si_lasti / 8) + 1; info->si_imap = kzalloc(imap_len, GFP_KERNEL | __GFP_NOWARN); if (!info->si_imap) { printf("Cannot allocate %u bytes\n", imap_len); goto out1; } for (i = 0; i < BFS_ROOT_INO; i++) set_bit(i, info->si_imap); s->s_op = &bfs_sops; inode = bfs_iget(s, BFS_ROOT_INO); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto out2; } s->s_root = d_make_root(inode); if (!s->s_root) { ret = -ENOMEM; goto out2; } info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS; info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; info->si_freei = 0; info->si_lf_eblk = 0; /* can we read the last block? 
*/ bh = sb_bread(s, info->si_blocks - 1); if (!bh) { printf("Last block not available: %lu\n", info->si_blocks - 1); ret = -EIO; goto out3; } brelse(bh); bh = NULL; for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) { struct bfs_inode *di; int block = (i - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; int off = (i - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; unsigned long eblock; if (!off) { brelse(bh); bh = sb_bread(s, block); } if (!bh) continue; di = (struct bfs_inode *)bh->b_data + off; /* test if filesystem is not corrupted */ i_eoff = le32_to_cpu(di->i_eoffset); i_sblock = le32_to_cpu(di->i_sblock); i_eblock = le32_to_cpu(di->i_eblock); s_size = le32_to_cpu(bfs_sb->s_end); if (i_sblock > info->si_blocks || i_eblock > info->si_blocks || i_sblock > i_eblock || (i_eoff != le32_to_cpu(-1) && i_eoff > s_size) || i_sblock * BFS_BSIZE > i_eoff) { printf("Inode 0x%08x corrupted\n", i); brelse(bh); ret = -EIO; goto out3; } if (!di->i_ino) { info->si_freei++; continue; } set_bit(i, info->si_imap); info->si_freeb -= BFS_FILEBLOCKS(di); eblock = le32_to_cpu(di->i_eblock); if (eblock > info->si_lf_eblk) info->si_lf_eblk = eblock; } brelse(bh); brelse(sbh); bfs_dump_imap("read_super", s); return 0; out3: dput(s->s_root); s->s_root = NULL; out2: kfree(info->si_imap); out1: brelse(sbh); out: mutex_destroy(&info->bfs_lock); kfree(info); s->s_fs_info = NULL; return ret; } static struct dentry *bfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super); } static struct file_system_type bfs_fs_type = { .owner = THIS_MODULE, .name = "bfs", .mount = bfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("bfs"); static int __init init_bfs_fs(void) { int err = init_inodecache(); if (err) goto out1; err = register_filesystem(&bfs_fs_type); if (err) goto out; return 0; out: destroy_inodecache(); out1: return err; } static void __exit exit_bfs_fs(void) { unregister_filesystem(&bfs_fs_type); destroy_inodecache(); } module_init(init_bfs_fs) module_exit(exit_bfs_fs) |
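/*
 * Illustrative sketch, not part of the bfs module above: once the "bfs"
 * filesystem type is registered, an image can be mounted from userspace
 * with mount(2).  The loop device and mount point below are assumptions
 * chosen for the example only.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Assumes /dev/loop0 was already attached to a BFS image via losetup. */
	if (mount("/dev/loop0", "/mnt/bfs", "bfs", MS_RDONLY, NULL) != 0) {
		perror("mount bfs");
		return 1;
	}
	puts("BFS image mounted read-only at /mnt/bfs");
	return 0;
}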
/* * Salsa20: Salsa20 stream cipher algorithm * * Copyright (c) 2007 Tan Swee Heng <thesweeheng@gmail.com> * * Derived from: * - salsa20.c: Public domain C code by Daniel J. Bernstein <djb@cr.yp.to> * * Salsa20 is a stream cipher candidate in eSTREAM, the ECRYPT Stream * Cipher Project. It is designed by Daniel J. Bernstein <djb@cr.yp.to>. * More information about eSTREAM and Salsa20 can be found here: * http://www.ecrypt.eu.org/stream/ * http://cr.yp.to/snuffle.html * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * */ #include <asm/unaligned.h> #include <crypto/internal/skcipher.h> #include <linux/module.h> #define SALSA20_IV_SIZE 8 #define SALSA20_MIN_KEY_SIZE 16 #define SALSA20_MAX_KEY_SIZE 32 #define SALSA20_BLOCK_SIZE 64 struct salsa20_ctx { u32 initial_state[16]; }; static void salsa20_block(u32 *state, __le32 *stream) { u32 x[16]; int i; memcpy(x, state, sizeof(x)); for (i = 0; i < 20; i += 2) { x[ 4] ^= rol32((x[ 0] + x[12]), 7); x[ 8] ^= rol32((x[ 4] + x[ 0]), 9); x[12] ^= rol32((x[ 8] + x[ 4]), 13); x[ 0] ^= rol32((x[12] + x[ 8]), 18); x[ 9] ^= rol32((x[ 5] + x[ 1]), 7); x[13] ^= rol32((x[ 9] + x[ 5]), 9); x[ 1] ^= rol32((x[13] + x[ 9]), 13); x[ 5] ^= rol32((x[ 1] + x[13]), 18); x[14] ^= rol32((x[10] + x[ 6]), 7); x[ 2] ^= rol32((x[14] + x[10]), 9); x[ 6] ^= rol32((x[ 2] + x[14]), 13); x[10] ^= rol32((x[ 6] + x[ 2]), 18); x[ 3] ^= rol32((x[15] + x[11]), 7); x[ 7] ^= rol32((x[ 3] + x[15]), 9); x[11] ^= rol32((x[ 7] + x[ 3]), 13); x[15] ^= rol32((x[11] + x[ 7]), 18); x[ 1] ^= rol32((x[ 0] + x[ 3]), 7); x[ 2] ^= rol32((x[ 1] + x[ 0]), 9); x[ 3] ^= rol32((x[ 2] + x[ 1]), 13); x[ 0] ^= rol32((x[ 3] + x[ 2]), 18); x[ 6] ^= rol32((x[ 5] + x[ 4]), 7); x[ 7] ^= rol32((x[ 6] + x[ 5]), 9); x[ 4] ^= rol32((x[ 7] + x[ 6]), 13); x[ 5] ^= rol32((x[ 4] + x[ 7]), 18); x[11] ^= rol32((x[10] + x[ 9]), 7); x[ 8] ^= rol32((x[11] + x[10]), 9); x[ 9] ^= rol32((x[ 8] + x[11]), 13); x[10] ^= rol32((x[ 9] + x[ 8]), 18); x[12] ^= rol32((x[15] + x[14]), 7); x[13] ^= rol32((x[12] + x[15]), 9); x[14] ^= rol32((x[13] + x[12]), 13); x[15] ^= rol32((x[14] + x[13]), 18); } for (i = 0; i < 16; i++) stream[i] = cpu_to_le32(x[i] + state[i]); if (++state[8] == 0) state[9]++; } static void salsa20_docrypt(u32 *state, u8 *dst, const u8 *src, unsigned int bytes) { __le32 stream[SALSA20_BLOCK_SIZE / sizeof(__le32)]; if (dst != src) memcpy(dst, src, bytes); while (bytes >= SALSA20_BLOCK_SIZE) { salsa20_block(state, stream); crypto_xor(dst, (const u8 *)stream, SALSA20_BLOCK_SIZE); bytes -= SALSA20_BLOCK_SIZE; dst += SALSA20_BLOCK_SIZE; } if (bytes) {
salsa20_block(state, stream); crypto_xor(dst, (const u8 *)stream, bytes); } } static void salsa20_init(u32 *state, const struct salsa20_ctx *ctx, const u8 *iv) { memcpy(state, ctx->initial_state, sizeof(ctx->initial_state)); state[6] = get_unaligned_le32(iv + 0); state[7] = get_unaligned_le32(iv + 4); } static int salsa20_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keysize) { static const char sigma[16] = "expand 32-byte k"; static const char tau[16] = "expand 16-byte k"; struct salsa20_ctx *ctx = crypto_skcipher_ctx(tfm); const char *constants; if (keysize != SALSA20_MIN_KEY_SIZE && keysize != SALSA20_MAX_KEY_SIZE) return -EINVAL; ctx->initial_state[1] = get_unaligned_le32(key + 0); ctx->initial_state[2] = get_unaligned_le32(key + 4); ctx->initial_state[3] = get_unaligned_le32(key + 8); ctx->initial_state[4] = get_unaligned_le32(key + 12); if (keysize == 32) { /* recommended */ key += 16; constants = sigma; } else { /* keysize == 16 */ constants = tau; } ctx->initial_state[11] = get_unaligned_le32(key + 0); ctx->initial_state[12] = get_unaligned_le32(key + 4); ctx->initial_state[13] = get_unaligned_le32(key + 8); ctx->initial_state[14] = get_unaligned_le32(key + 12); ctx->initial_state[0] = get_unaligned_le32(constants + 0); ctx->initial_state[5] = get_unaligned_le32(constants + 4); ctx->initial_state[10] = get_unaligned_le32(constants + 8); ctx->initial_state[15] = get_unaligned_le32(constants + 12); /* space for the nonce; it will be overridden for each request */ ctx->initial_state[6] = 0; ctx->initial_state[7] = 0; /* initial block number */ ctx->initial_state[8] = 0; ctx->initial_state[9] = 0; return 0; } static int salsa20_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct salsa20_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; u32 state[16]; int err; err = skcipher_walk_virt(&walk, req, true); salsa20_init(state, ctx, req->iv); while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; if (nbytes < walk.total) nbytes = round_down(nbytes, walk.stride); salsa20_docrypt(state, walk.dst.virt.addr, walk.src.virt.addr, nbytes); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; } static struct skcipher_alg alg = { .base.cra_name = "salsa20", .base.cra_driver_name = "salsa20-generic", .base.cra_priority = 100, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct salsa20_ctx), .base.cra_module = THIS_MODULE, .min_keysize = SALSA20_MIN_KEY_SIZE, .max_keysize = SALSA20_MAX_KEY_SIZE, .ivsize = SALSA20_IV_SIZE, .chunksize = SALSA20_BLOCK_SIZE, .setkey = salsa20_setkey, .encrypt = salsa20_crypt, .decrypt = salsa20_crypt, }; static int __init salsa20_generic_mod_init(void) { return crypto_register_skcipher(&alg); } static void __exit salsa20_generic_mod_fini(void) { crypto_unregister_skcipher(&alg); } module_init(salsa20_generic_mod_init); module_exit(salsa20_generic_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm"); MODULE_ALIAS_CRYPTO("salsa20"); MODULE_ALIAS_CRYPTO("salsa20-generic"); |
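/*
 * Illustrative sketch, not part of the cipher above: how another kernel
 * module could drive the registered "salsa20" skcipher through the crypto
 * API.  Key, 8-byte IV and buffer are caller-supplied; the buffer must be a
 * linear kernel allocation (e.g. kmalloc'ed), not stack memory, because it
 * is mapped through a scatterlist.  Error handling is kept minimal.
 */
#include <crypto/skcipher.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/scatterlist.h>

static int salsa20_demo_crypt(u8 *buf, unsigned int len,
			      const u8 *key, unsigned int keylen,
			      u8 *iv /* 8-byte nonce */)
{
	struct crypto_skcipher *tfm;
	struct skcipher_request *req;
	struct scatterlist sg;
	DECLARE_CRYPTO_WAIT(wait);
	int err;

	tfm = crypto_alloc_skcipher("salsa20", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_skcipher_setkey(tfm, key, keylen);
	if (err)
		goto out_tfm;

	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_tfm;
	}

	sg_init_one(&sg, buf, len);
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP,
				      crypto_req_done, &wait);
	skcipher_request_set_crypt(req, &sg, &sg, len, iv);

	/* Encrypt and decrypt are the same keystream XOR for a stream cipher. */
	err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);

	skcipher_request_free(req);
out_tfm:
	crypto_free_skcipher(tfm);
	return err;
}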
/* * * Author Karsten Keil <kkeil@novell.com> * * Copyright 2008 by Karsten Keil <kkeil@novell.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details.
* */ #include <linux/slab.h> #include <linux/mISDNif.h> #include <linux/kthread.h> #include <linux/sched.h> #include <linux/sched/cputime.h> #include <linux/signal.h> #include "core.h" static u_int *debug; static inline void _queue_message(struct mISDNstack *st, struct sk_buff *skb) { struct mISDNhead *hh = mISDN_HEAD_P(skb); if (*debug & DEBUG_QUEUE_FUNC) printk(KERN_DEBUG "%s prim(%x) id(%x) %p\n", __func__, hh->prim, hh->id, skb); skb_queue_tail(&st->msgq, skb); if (likely(!test_bit(mISDN_STACK_STOPPED, &st->status))) { test_and_set_bit(mISDN_STACK_WORK, &st->status); wake_up_interruptible(&st->workq); } } static int mISDN_queue_message(struct mISDNchannel *ch, struct sk_buff *skb) { _queue_message(ch->st, skb); return 0; } static struct mISDNchannel * get_channel4id(struct mISDNstack *st, u_int id) { struct mISDNchannel *ch; mutex_lock(&st->lmutex); list_for_each_entry(ch, &st->layer2, list) { if (id == ch->nr) goto unlock; } ch = NULL; unlock: mutex_unlock(&st->lmutex); return ch; } static void send_socklist(struct mISDN_sock_list *sl, struct sk_buff *skb) { struct sock *sk; struct sk_buff *cskb = NULL; read_lock(&sl->lock); sk_for_each(sk, &sl->head) { if (sk->sk_state != MISDN_BOUND) continue; if (!cskb) cskb = skb_copy(skb, GFP_ATOMIC); if (!cskb) { printk(KERN_WARNING "%s no skb\n", __func__); break; } if (!sock_queue_rcv_skb(sk, cskb)) cskb = NULL; } read_unlock(&sl->lock); if (cskb) dev_kfree_skb(cskb); } static void send_layer2(struct mISDNstack *st, struct sk_buff *skb) { struct sk_buff *cskb; struct mISDNhead *hh = mISDN_HEAD_P(skb); struct mISDNchannel *ch; int ret; if (!st) return; mutex_lock(&st->lmutex); if ((hh->id & MISDN_ID_ADDR_MASK) == MISDN_ID_ANY) { /* L2 for all */ list_for_each_entry(ch, &st->layer2, list) { if (list_is_last(&ch->list, &st->layer2)) { cskb = skb; skb = NULL; } else { cskb = skb_copy(skb, GFP_KERNEL); } if (cskb) { ret = ch->send(ch, cskb); if (ret) { if (*debug & DEBUG_SEND_ERR) printk(KERN_DEBUG "%s ch%d prim(%x) addr(%x)" " err %d\n", __func__, ch->nr, hh->prim, ch->addr, ret); dev_kfree_skb(cskb); } } else { printk(KERN_WARNING "%s ch%d addr %x no mem\n", __func__, ch->nr, ch->addr); goto out; } } } else { list_for_each_entry(ch, &st->layer2, list) { if ((hh->id & MISDN_ID_ADDR_MASK) == ch->addr) { ret = ch->send(ch, skb); if (!ret) skb = NULL; goto out; } } ret = st->dev->teimgr->ctrl(st->dev->teimgr, CHECK_DATA, skb); if (!ret) skb = NULL; else if (*debug & DEBUG_SEND_ERR) printk(KERN_DEBUG "%s mgr prim(%x) err %d\n", __func__, hh->prim, ret); } out: mutex_unlock(&st->lmutex); if (skb) dev_kfree_skb(skb); } static inline int send_msg_to_layer(struct mISDNstack *st, struct sk_buff *skb) { struct mISDNhead *hh = mISDN_HEAD_P(skb); struct mISDNchannel *ch; int lm; lm = hh->prim & MISDN_LAYERMASK; if (*debug & DEBUG_QUEUE_FUNC) printk(KERN_DEBUG "%s prim(%x) id(%x) %p\n", __func__, hh->prim, hh->id, skb); if (lm == 0x1) { if (!hlist_empty(&st->l1sock.head)) { __net_timestamp(skb); send_socklist(&st->l1sock, skb); } return st->layer1->send(st->layer1, skb); } else if (lm == 0x2) { if (!hlist_empty(&st->l1sock.head)) send_socklist(&st->l1sock, skb); send_layer2(st, skb); return 0; } else if (lm == 0x4) { ch = get_channel4id(st, hh->id); if (ch) return ch->send(ch, skb); else printk(KERN_WARNING "%s: dev(%s) prim(%x) id(%x) no channel\n", __func__, dev_name(&st->dev->dev), hh->prim, hh->id); } else if (lm == 0x8) { WARN_ON(lm == 0x8); ch = get_channel4id(st, hh->id); if (ch) return ch->send(ch, skb); else printk(KERN_WARNING "%s: dev(%s) prim(%x) 
id(%x) no channel\n", __func__, dev_name(&st->dev->dev), hh->prim, hh->id); } else { /* broadcast not handled yet */ printk(KERN_WARNING "%s: dev(%s) prim %x not delivered\n", __func__, dev_name(&st->dev->dev), hh->prim); } return -ESRCH; } static void do_clear_stack(struct mISDNstack *st) { } static int mISDNStackd(void *data) { struct mISDNstack *st = data; #ifdef MISDN_MSG_STATS u64 utime, stime; #endif int err = 0; sigfillset(¤t->blocked); if (*debug & DEBUG_MSG_THREAD) printk(KERN_DEBUG "mISDNStackd %s started\n", dev_name(&st->dev->dev)); if (st->notify != NULL) { complete(st->notify); st->notify = NULL; } for (;;) { struct sk_buff *skb; if (unlikely(test_bit(mISDN_STACK_STOPPED, &st->status))) { test_and_clear_bit(mISDN_STACK_WORK, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); } else test_and_set_bit(mISDN_STACK_RUNNING, &st->status); while (test_bit(mISDN_STACK_WORK, &st->status)) { skb = skb_dequeue(&st->msgq); if (!skb) { test_and_clear_bit(mISDN_STACK_WORK, &st->status); /* test if a race happens */ skb = skb_dequeue(&st->msgq); if (!skb) continue; test_and_set_bit(mISDN_STACK_WORK, &st->status); } #ifdef MISDN_MSG_STATS st->msg_cnt++; #endif err = send_msg_to_layer(st, skb); if (unlikely(err)) { if (*debug & DEBUG_SEND_ERR) printk(KERN_DEBUG "%s: %s prim(%x) id(%x) " "send call(%d)\n", __func__, dev_name(&st->dev->dev), mISDN_HEAD_PRIM(skb), mISDN_HEAD_ID(skb), err); dev_kfree_skb(skb); continue; } if (unlikely(test_bit(mISDN_STACK_STOPPED, &st->status))) { test_and_clear_bit(mISDN_STACK_WORK, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); break; } } if (test_bit(mISDN_STACK_CLEARING, &st->status)) { test_and_set_bit(mISDN_STACK_STOPPED, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); do_clear_stack(st); test_and_clear_bit(mISDN_STACK_CLEARING, &st->status); test_and_set_bit(mISDN_STACK_RESTART, &st->status); } if (test_and_clear_bit(mISDN_STACK_RESTART, &st->status)) { test_and_clear_bit(mISDN_STACK_STOPPED, &st->status); test_and_set_bit(mISDN_STACK_RUNNING, &st->status); if (!skb_queue_empty(&st->msgq)) test_and_set_bit(mISDN_STACK_WORK, &st->status); } if (test_bit(mISDN_STACK_ABORT, &st->status)) break; if (st->notify != NULL) { complete(st->notify); st->notify = NULL; } #ifdef MISDN_MSG_STATS st->sleep_cnt++; #endif test_and_clear_bit(mISDN_STACK_ACTIVE, &st->status); wait_event_interruptible(st->workq, (st->status & mISDN_STACK_ACTION_MASK)); if (*debug & DEBUG_MSG_THREAD) printk(KERN_DEBUG "%s: %s wake status %08lx\n", __func__, dev_name(&st->dev->dev), st->status); test_and_set_bit(mISDN_STACK_ACTIVE, &st->status); test_and_clear_bit(mISDN_STACK_WAKEUP, &st->status); if (test_bit(mISDN_STACK_STOPPED, &st->status)) { test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); #ifdef MISDN_MSG_STATS st->stopped_cnt++; #endif } } #ifdef MISDN_MSG_STATS printk(KERN_DEBUG "mISDNStackd daemon for %s proceed %d " "msg %d sleep %d stopped\n", dev_name(&st->dev->dev), st->msg_cnt, st->sleep_cnt, st->stopped_cnt); task_cputime(st->thread, &utime, &stime); printk(KERN_DEBUG "mISDNStackd daemon for %s utime(%llu) stime(%llu)\n", dev_name(&st->dev->dev), utime, stime); printk(KERN_DEBUG "mISDNStackd daemon for %s nvcsw(%ld) nivcsw(%ld)\n", dev_name(&st->dev->dev), st->thread->nvcsw, st->thread->nivcsw); printk(KERN_DEBUG "mISDNStackd daemon for %s killed now\n", dev_name(&st->dev->dev)); #endif test_and_set_bit(mISDN_STACK_KILLED, &st->status); test_and_clear_bit(mISDN_STACK_RUNNING, &st->status); 
test_and_clear_bit(mISDN_STACK_ACTIVE, &st->status); test_and_clear_bit(mISDN_STACK_ABORT, &st->status); skb_queue_purge(&st->msgq); st->thread = NULL; if (st->notify != NULL) { complete(st->notify); st->notify = NULL; } return 0; } static int l1_receive(struct mISDNchannel *ch, struct sk_buff *skb) { if (!ch->st) return -ENODEV; __net_timestamp(skb); _queue_message(ch->st, skb); return 0; } void set_channel_address(struct mISDNchannel *ch, u_int sapi, u_int tei) { ch->addr = sapi | (tei << 8); } void __add_layer2(struct mISDNchannel *ch, struct mISDNstack *st) { list_add_tail(&ch->list, &st->layer2); } void add_layer2(struct mISDNchannel *ch, struct mISDNstack *st) { mutex_lock(&st->lmutex); __add_layer2(ch, st); mutex_unlock(&st->lmutex); } static int st_own_ctrl(struct mISDNchannel *ch, u_int cmd, void *arg) { if (!ch->st || !ch->st->layer1) return -EINVAL; return ch->st->layer1->ctrl(ch->st->layer1, cmd, arg); } int create_stack(struct mISDNdevice *dev) { struct mISDNstack *newst; int err; DECLARE_COMPLETION_ONSTACK(done); newst = kzalloc(sizeof(struct mISDNstack), GFP_KERNEL); if (!newst) { printk(KERN_ERR "kmalloc mISDN_stack failed\n"); return -ENOMEM; } newst->dev = dev; INIT_LIST_HEAD(&newst->layer2); INIT_HLIST_HEAD(&newst->l1sock.head); rwlock_init(&newst->l1sock.lock); init_waitqueue_head(&newst->workq); skb_queue_head_init(&newst->msgq); mutex_init(&newst->lmutex); dev->D.st = newst; err = create_teimanager(dev); if (err) { printk(KERN_ERR "kmalloc teimanager failed\n"); kfree(newst); return err; } dev->teimgr->peer = &newst->own; dev->teimgr->recv = mISDN_queue_message; dev->teimgr->st = newst; newst->layer1 = &dev->D; dev->D.recv = l1_receive; dev->D.peer = &newst->own; newst->own.st = newst; newst->own.ctrl = st_own_ctrl; newst->own.send = mISDN_queue_message; newst->own.recv = mISDN_queue_message; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s)\n", __func__, dev_name(&newst->dev->dev)); newst->notify = &done; newst->thread = kthread_run(mISDNStackd, (void *)newst, "mISDN_%s", dev_name(&newst->dev->dev)); if (IS_ERR(newst->thread)) { err = PTR_ERR(newst->thread); printk(KERN_ERR "mISDN:cannot create kernel thread for %s (%d)\n", dev_name(&newst->dev->dev), err); delete_teimanager(dev->teimgr); kfree(newst); } else wait_for_completion(&done); return err; } int connect_layer1(struct mISDNdevice *dev, struct mISDNchannel *ch, u_int protocol, struct sockaddr_mISDN *adr) { struct mISDN_sock *msk = container_of(ch, struct mISDN_sock, ch); struct channel_req rq; int err; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); switch (protocol) { case ISDN_P_NT_S0: case ISDN_P_NT_E1: case ISDN_P_TE_S0: case ISDN_P_TE_E1: ch->recv = mISDN_queue_message; ch->peer = &dev->D.st->own; ch->st = dev->D.st; rq.protocol = protocol; rq.adr.channel = adr->channel; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret %d (dev %d)\n", __func__, err, dev->id); if (err) return err; write_lock_bh(&dev->D.st->l1sock.lock); sk_add_node(&msk->sk, &dev->D.st->l1sock.head); write_unlock_bh(&dev->D.st->l1sock.lock); break; default: return -ENOPROTOOPT; } return 0; } int connect_Bstack(struct mISDNdevice *dev, struct mISDNchannel *ch, u_int protocol, struct sockaddr_mISDN *adr) { struct channel_req rq, rq2; int pmask, err; struct Bprotocol *bp; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", __func__, 
dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); ch->st = dev->D.st; pmask = 1 << (protocol & ISDN_P_B_MASK); if (pmask & dev->Bprotocols) { rq.protocol = protocol; rq.adr = *adr; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); if (err) return err; ch->recv = rq.ch->send; ch->peer = rq.ch; rq.ch->recv = ch->send; rq.ch->peer = ch; rq.ch->st = dev->D.st; } else { bp = get_Bprotocol4mask(pmask); if (!bp) return -ENOPROTOOPT; rq2.protocol = protocol; rq2.adr = *adr; rq2.ch = ch; err = bp->create(&rq2); if (err) return err; ch->recv = rq2.ch->send; ch->peer = rq2.ch; rq2.ch->st = dev->D.st; rq.protocol = rq2.protocol; rq.adr = *adr; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); if (err) { rq2.ch->ctrl(rq2.ch, CLOSE_CHANNEL, NULL); return err; } rq2.ch->recv = rq.ch->send; rq2.ch->peer = rq.ch; rq.ch->recv = rq2.ch->send; rq.ch->peer = rq2.ch; rq.ch->st = dev->D.st; } ch->protocol = protocol; ch->nr = rq.ch->nr; return 0; } int create_l2entity(struct mISDNdevice *dev, struct mISDNchannel *ch, u_int protocol, struct sockaddr_mISDN *adr) { struct channel_req rq; int err; if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: %s proto(%x) adr(%d %d %d %d)\n", __func__, dev_name(&dev->dev), protocol, adr->dev, adr->channel, adr->sapi, adr->tei); rq.protocol = ISDN_P_TE_S0; if (dev->Dprotocols & (1 << ISDN_P_TE_E1)) rq.protocol = ISDN_P_TE_E1; switch (protocol) { case ISDN_P_LAPD_NT: rq.protocol = ISDN_P_NT_S0; if (dev->Dprotocols & (1 << ISDN_P_NT_E1)) rq.protocol = ISDN_P_NT_E1; /* fall through */ case ISDN_P_LAPD_TE: ch->recv = mISDN_queue_message; ch->peer = &dev->D.st->own; ch->st = dev->D.st; rq.adr.channel = 0; err = dev->D.ctrl(&dev->D, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret 1 %d\n", __func__, err); if (err) break; rq.protocol = protocol; rq.adr = *adr; rq.ch = ch; err = dev->teimgr->ctrl(dev->teimgr, OPEN_CHANNEL, &rq); printk(KERN_DEBUG "%s: ret 2 %d\n", __func__, err); if (!err) { if ((protocol == ISDN_P_LAPD_NT) && !rq.ch) break; add_layer2(rq.ch, dev->D.st); rq.ch->recv = mISDN_queue_message; rq.ch->peer = &dev->D.st->own; rq.ch->ctrl(rq.ch, OPEN_CHANNEL, NULL); /* can't fail */ } break; default: err = -EPROTONOSUPPORT; } return err; } void delete_channel(struct mISDNchannel *ch) { struct mISDN_sock *msk = container_of(ch, struct mISDN_sock, ch); struct mISDNchannel *pch; if (!ch->st) { printk(KERN_WARNING "%s: no stack\n", __func__); return; } if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s) protocol(%x)\n", __func__, dev_name(&ch->st->dev->dev), ch->protocol); if (ch->protocol >= ISDN_P_B_START) { if (ch->peer) { ch->peer->ctrl(ch->peer, CLOSE_CHANNEL, NULL); ch->peer = NULL; } return; } switch (ch->protocol) { case ISDN_P_NT_S0: case ISDN_P_TE_S0: case ISDN_P_NT_E1: case ISDN_P_TE_E1: write_lock_bh(&ch->st->l1sock.lock); sk_del_node_init(&msk->sk); write_unlock_bh(&ch->st->l1sock.lock); ch->st->dev->D.ctrl(&ch->st->dev->D, CLOSE_CHANNEL, NULL); break; case ISDN_P_LAPD_TE: pch = get_channel4id(ch->st, ch->nr); if (pch) { mutex_lock(&ch->st->lmutex); list_del(&pch->list); mutex_unlock(&ch->st->lmutex); pch->ctrl(pch, CLOSE_CHANNEL, NULL); pch = ch->st->dev->teimgr; pch->ctrl(pch, CLOSE_CHANNEL, NULL); } else printk(KERN_WARNING "%s: no l2 channel\n", __func__); break; case ISDN_P_LAPD_NT: pch = ch->st->dev->teimgr; if (pch) { pch->ctrl(pch, CLOSE_CHANNEL, NULL); } else printk(KERN_WARNING "%s: no l2 channel\n", __func__); break; default: break; } return; } void delete_stack(struct mISDNdevice *dev) { struct mISDNstack *st = dev->D.st; 
DECLARE_COMPLETION_ONSTACK(done); if (*debug & DEBUG_CORE_FUNC) printk(KERN_DEBUG "%s: st(%s)\n", __func__, dev_name(&st->dev->dev)); if (dev->teimgr) delete_teimanager(dev->teimgr); if (st->thread) { if (st->notify) { printk(KERN_WARNING "%s: notifier in use\n", __func__); complete(st->notify); } st->notify = &done; test_and_set_bit(mISDN_STACK_ABORT, &st->status); test_and_set_bit(mISDN_STACK_WAKEUP, &st->status); wake_up_interruptible(&st->workq); wait_for_completion(&done); } if (!list_empty(&st->layer2)) printk(KERN_WARNING "%s: layer2 list not empty\n", __func__); if (!hlist_empty(&st->l1sock.head)) printk(KERN_WARNING "%s: layer1 list not empty\n", __func__); kfree(st); } void mISDN_initstack(u_int *dp) { debug = dp; } |
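/*
 * Illustrative sketch, not part of mISDN: the worker-thread handshake used
 * by mISDNStackd() above, reduced to its core.  A producer sets a "work"
 * bit and wakes the thread; the thread clears the bit, drains its queue and
 * sleeps again.  All names here are invented for the example.
 */
#include <linux/bitops.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/wait.h>

#define DEMO_WORK	0	/* bit number in demo_worker::status */

struct demo_worker {
	unsigned long status;
	wait_queue_head_t waitq;
	struct task_struct *thread;
};

static int demo_thread(void *data)
{
	struct demo_worker *w = data;

	while (!kthread_should_stop()) {
		wait_event_interruptible(w->waitq,
					 test_bit(DEMO_WORK, &w->status) ||
					 kthread_should_stop());
		if (test_and_clear_bit(DEMO_WORK, &w->status)) {
			/* drain the message queue here */
		}
	}
	return 0;
}

/* Producer side: the analogue of _queue_message() marking work and waking. */
static void demo_kick(struct demo_worker *w)
{
	set_bit(DEMO_WORK, &w->status);
	wake_up_interruptible(&w->waitq);
}

static int demo_start(struct demo_worker *w)
{
	w->status = 0;
	init_waitqueue_head(&w->waitq);
	w->thread = kthread_run(demo_thread, w, "demo_worker");
	return PTR_ERR_OR_ZERO(w->thread);
}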
/* RFCOMM implementation for Linux Bluetooth stack (BlueZ). Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com> Copyright (C) 2002 Marcel Holtmann <marcel@holtmann.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* * RFCOMM sockets.
*/ #include <linux/export.h> #include <linux/debugfs.h> #include <linux/sched/signal.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> #include <net/bluetooth/rfcomm.h> static const struct proto_ops rfcomm_sock_ops; static struct bt_sock_list rfcomm_sk_list = { .lock = __RW_LOCK_UNLOCKED(rfcomm_sk_list.lock) }; static void rfcomm_sock_close(struct sock *sk); static void rfcomm_sock_kill(struct sock *sk); /* ---- DLC callbacks ---- * * called under rfcomm_dlc_lock() */ static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb) { struct sock *sk = d->owner; if (!sk) return; atomic_add(skb->len, &sk->sk_rmem_alloc); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) rfcomm_dlc_throttle(d); } static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) { struct sock *sk = d->owner, *parent; unsigned long flags; if (!sk) return; BT_DBG("dlc %p state %ld err %d", d, d->state, err); local_irq_save(flags); bh_lock_sock(sk); if (err) sk->sk_err = err; sk->sk_state = d->state; parent = bt_sk(sk)->parent; if (parent) { if (d->state == BT_CLOSED) { sock_set_flag(sk, SOCK_ZAPPED); bt_accept_unlink(sk); } parent->sk_data_ready(parent); } else { if (d->state == BT_CONNECTED) rfcomm_session_getaddr(d->session, &rfcomm_pi(sk)->src, NULL); sk->sk_state_change(sk); } bh_unlock_sock(sk); local_irq_restore(flags); if (parent && sock_flag(sk, SOCK_ZAPPED)) { /* We have to drop DLC lock here, otherwise * rfcomm_sock_destruct() will dead lock. */ rfcomm_dlc_unlock(d); rfcomm_sock_kill(sk); rfcomm_dlc_lock(d); } } /* ---- Socket functions ---- */ static struct sock *__rfcomm_get_listen_sock_by_addr(u8 channel, bdaddr_t *src) { struct sock *sk = NULL; sk_for_each(sk, &rfcomm_sk_list.head) { if (rfcomm_pi(sk)->channel != channel) continue; if (bacmp(&rfcomm_pi(sk)->src, src)) continue; if (sk->sk_state == BT_BOUND || sk->sk_state == BT_LISTEN) break; } return sk ? sk : NULL; } /* Find socket with channel and source bdaddr. * Returns closest match. */ static struct sock *rfcomm_get_sock_by_channel(int state, u8 channel, bdaddr_t *src) { struct sock *sk = NULL, *sk1 = NULL; read_lock(&rfcomm_sk_list.lock); sk_for_each(sk, &rfcomm_sk_list.head) { if (state && sk->sk_state != state) continue; if (rfcomm_pi(sk)->channel == channel) { /* Exact match. */ if (!bacmp(&rfcomm_pi(sk)->src, src)) break; /* Closest match */ if (!bacmp(&rfcomm_pi(sk)->src, BDADDR_ANY)) sk1 = sk; } } read_unlock(&rfcomm_sk_list.lock); return sk ? sk : sk1; } static void rfcomm_sock_destruct(struct sock *sk) { struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; BT_DBG("sk %p dlc %p", sk, d); skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); rfcomm_dlc_lock(d); rfcomm_pi(sk)->dlc = NULL; /* Detach DLC if it's owned by this socket */ if (d->owner == sk) d->owner = NULL; rfcomm_dlc_unlock(d); rfcomm_dlc_put(d); } static void rfcomm_sock_cleanup_listen(struct sock *parent) { struct sock *sk; BT_DBG("parent %p", parent); /* Close not yet accepted dlcs */ while ((sk = bt_accept_dequeue(parent, NULL))) { rfcomm_sock_close(sk); rfcomm_sock_kill(sk); } parent->sk_state = BT_CLOSED; sock_set_flag(parent, SOCK_ZAPPED); } /* Kill socket (only if zapped and orphan) * Must be called on unlocked socket. 
*/ static void rfcomm_sock_kill(struct sock *sk) { if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket) return; BT_DBG("sk %p state %d refcnt %d", sk, sk->sk_state, refcount_read(&sk->sk_refcnt)); /* Kill poor orphan */ bt_sock_unlink(&rfcomm_sk_list, sk); sock_set_flag(sk, SOCK_DEAD); sock_put(sk); } static void __rfcomm_sock_close(struct sock *sk) { struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket); switch (sk->sk_state) { case BT_LISTEN: rfcomm_sock_cleanup_listen(sk); break; case BT_CONNECT: case BT_CONNECT2: case BT_CONFIG: case BT_CONNECTED: rfcomm_dlc_close(d, 0); /* fall through */ default: sock_set_flag(sk, SOCK_ZAPPED); break; } } /* Close socket. * Must be called on unlocked socket. */ static void rfcomm_sock_close(struct sock *sk) { lock_sock(sk); __rfcomm_sock_close(sk); release_sock(sk); } static void rfcomm_sock_init(struct sock *sk, struct sock *parent) { struct rfcomm_pinfo *pi = rfcomm_pi(sk); BT_DBG("sk %p", sk); if (parent) { sk->sk_type = parent->sk_type; pi->dlc->defer_setup = test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags); pi->sec_level = rfcomm_pi(parent)->sec_level; pi->role_switch = rfcomm_pi(parent)->role_switch; security_sk_clone(parent, sk); } else { pi->dlc->defer_setup = 0; pi->sec_level = BT_SECURITY_LOW; pi->role_switch = 0; } pi->dlc->sec_level = pi->sec_level; pi->dlc->role_switch = pi->role_switch; } static struct proto rfcomm_proto = { .name = "RFCOMM", .owner = THIS_MODULE, .obj_size = sizeof(struct rfcomm_pinfo) }; static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) { struct rfcomm_dlc *d; struct sock *sk; sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto, kern); if (!sk) return NULL; sock_init_data(sock, sk); INIT_LIST_HEAD(&bt_sk(sk)->accept_q); d = rfcomm_dlc_alloc(prio); if (!d) { sk_free(sk); return NULL; } d->data_ready = rfcomm_sk_data_ready; d->state_change = rfcomm_sk_state_change; rfcomm_pi(sk)->dlc = d; d->owner = sk; sk->sk_destruct = rfcomm_sock_destruct; sk->sk_sndtimeo = RFCOMM_CONN_TIMEOUT; sk->sk_sndbuf = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10; sk->sk_rcvbuf = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10; sock_reset_flag(sk, SOCK_ZAPPED); sk->sk_protocol = proto; sk->sk_state = BT_OPEN; bt_sock_link(&rfcomm_sk_list, sk); BT_DBG("sk %p", sk); return sk; } static int rfcomm_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; BT_DBG("sock %p", sock); sock->state = SS_UNCONNECTED; if (sock->type != SOCK_STREAM && sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; sock->ops = &rfcomm_sock_ops; sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; rfcomm_sock_init(sk, NULL); return 0; } static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_rc sa; struct sock *sk = sock->sk; int len, err = 0; if (!addr || addr_len < offsetofend(struct sockaddr, sa_family) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; memset(&sa, 0, sizeof(sa)); len = min_t(unsigned int, sizeof(sa), addr_len); memcpy(&sa, addr, len); BT_DBG("sk %p %pMR", sk, &sa.rc_bdaddr); lock_sock(sk); if (sk->sk_state != BT_OPEN) { err = -EBADFD; goto done; } if (sk->sk_type != SOCK_STREAM) { err = -EINVAL; goto done; } write_lock(&rfcomm_sk_list.lock); if (sa.rc_channel && __rfcomm_get_listen_sock_by_addr(sa.rc_channel, &sa.rc_bdaddr)) { err = -EADDRINUSE; } else { /* Save source address */ bacpy(&rfcomm_pi(sk)->src, &sa.rc_bdaddr); 
rfcomm_pi(sk)->channel = sa.rc_channel; sk->sk_state = BT_BOUND; } write_unlock(&rfcomm_sk_list.lock); done: release_sock(sk); return err; } static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { struct sockaddr_rc *sa = (struct sockaddr_rc *) addr; struct sock *sk = sock->sk; struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; int err = 0; BT_DBG("sk %p", sk); if (alen < sizeof(struct sockaddr_rc) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; lock_sock(sk); if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) { err = -EBADFD; goto done; } if (sk->sk_type != SOCK_STREAM) { err = -EINVAL; goto done; } sk->sk_state = BT_CONNECT; bacpy(&rfcomm_pi(sk)->dst, &sa->rc_bdaddr); rfcomm_pi(sk)->channel = sa->rc_channel; d->sec_level = rfcomm_pi(sk)->sec_level; d->role_switch = rfcomm_pi(sk)->role_switch; err = rfcomm_dlc_open(d, &rfcomm_pi(sk)->src, &sa->rc_bdaddr, sa->rc_channel); if (!err) err = bt_sock_wait_state(sk, BT_CONNECTED, sock_sndtimeo(sk, flags & O_NONBLOCK)); done: release_sock(sk); return err; } static int rfcomm_sock_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sk %p backlog %d", sk, backlog); lock_sock(sk); if (sk->sk_state != BT_BOUND) { err = -EBADFD; goto done; } if (sk->sk_type != SOCK_STREAM) { err = -EINVAL; goto done; } if (!rfcomm_pi(sk)->channel) { bdaddr_t *src = &rfcomm_pi(sk)->src; u8 channel; err = -EINVAL; write_lock(&rfcomm_sk_list.lock); for (channel = 1; channel < 31; channel++) if (!__rfcomm_get_listen_sock_by_addr(channel, src)) { rfcomm_pi(sk)->channel = channel; err = 0; break; } write_unlock(&rfcomm_sk_list.lock); if (err < 0) goto done; } sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = BT_LISTEN; done: release_sock(sk); return err; } static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *nsk; long timeo; int err = 0; lock_sock_nested(sk, SINGLE_DEPTH_NESTING); if (sk->sk_type != SOCK_STREAM) { err = -EINVAL; goto done; } timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); BT_DBG("sk %p timeo %ld", sk, timeo); /* Wait for an incoming connection. (wake-one). 
*/ add_wait_queue_exclusive(sk_sleep(sk), &wait); while (1) { if (sk->sk_state != BT_LISTEN) { err = -EBADFD; break; } nsk = bt_accept_dequeue(sk, newsock); if (nsk) break; if (!timeo) { err = -EAGAIN; break; } if (signal_pending(current)) { err = sock_intr_errno(timeo); break; } release_sock(sk); timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock_nested(sk, SINGLE_DEPTH_NESTING); } remove_wait_queue(sk_sleep(sk), &wait); if (err) goto done; newsock->state = SS_CONNECTED; BT_DBG("new socket %p", nsk); done: release_sock(sk); return err; } static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_rc *sa = (struct sockaddr_rc *) addr; struct sock *sk = sock->sk; BT_DBG("sock %p, sk %p", sock, sk); if (peer && sk->sk_state != BT_CONNECTED && sk->sk_state != BT_CONNECT && sk->sk_state != BT_CONNECT2) return -ENOTCONN; memset(sa, 0, sizeof(*sa)); sa->rc_family = AF_BLUETOOTH; sa->rc_channel = rfcomm_pi(sk)->channel; if (peer) bacpy(&sa->rc_bdaddr, &rfcomm_pi(sk)->dst); else bacpy(&sa->rc_bdaddr, &rfcomm_pi(sk)->src); return sizeof(struct sockaddr_rc); } static int rfcomm_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; struct sk_buff *skb; int sent; if (test_bit(RFCOMM_DEFER_SETUP, &d->flags)) return -ENOTCONN; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; if (sk->sk_shutdown & SEND_SHUTDOWN) return -EPIPE; BT_DBG("sock %p, sk %p", sock, sk); lock_sock(sk); sent = bt_sock_wait_ready(sk, msg->msg_flags); release_sock(sk); if (sent) return sent; skb = bt_skb_sendmmsg(sk, msg, len, d->mtu, RFCOMM_SKB_HEAD_RESERVE, RFCOMM_SKB_TAIL_RESERVE); if (IS_ERR(skb)) return PTR_ERR(skb); sent = rfcomm_dlc_send(d, skb); if (sent < 0) kfree_skb(skb); return sent; } static int rfcomm_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; int len; if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { rfcomm_dlc_accept(d); return 0; } len = bt_sock_stream_recvmsg(sock, msg, size, flags); lock_sock(sk); if (!(flags & MSG_PEEK) && len > 0) atomic_sub(len, &sk->sk_rmem_alloc); if (atomic_read(&sk->sk_rmem_alloc) <= (sk->sk_rcvbuf >> 2)) rfcomm_dlc_unthrottle(rfcomm_pi(sk)->dlc); release_sock(sk); return len; } static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, unsigned int optlen) { struct sock *sk = sock->sk; int err = 0; u32 opt; BT_DBG("sk %p", sk); lock_sock(sk); switch (optname) { case RFCOMM_LM: if (get_user(opt, (u32 __user *) optval)) { err = -EFAULT; break; } if (opt & RFCOMM_LM_FIPS) { err = -EINVAL; break; } if (opt & RFCOMM_LM_AUTH) rfcomm_pi(sk)->sec_level = BT_SECURITY_LOW; if (opt & RFCOMM_LM_ENCRYPT) rfcomm_pi(sk)->sec_level = BT_SECURITY_MEDIUM; if (opt & RFCOMM_LM_SECURE) rfcomm_pi(sk)->sec_level = BT_SECURITY_HIGH; rfcomm_pi(sk)->role_switch = (opt & RFCOMM_LM_MASTER); break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { struct sock *sk = sock->sk; struct bt_security sec; int err = 0; size_t len; u32 opt; BT_DBG("sk %p", sk); if (level == SOL_RFCOMM) return rfcomm_sock_setsockopt_old(sock, optname, optval, optlen); if (level != SOL_BLUETOOTH) return -ENOPROTOOPT; lock_sock(sk); switch (optname) { case BT_SECURITY: if (sk->sk_type != SOCK_STREAM) { err = -EINVAL; 
break; } sec.level = BT_SECURITY_LOW; len = min_t(unsigned int, sizeof(sec), optlen); if (copy_from_user((char *) &sec, optval, len)) { err = -EFAULT; break; } if (sec.level > BT_SECURITY_HIGH) { err = -EINVAL; break; } rfcomm_pi(sk)->sec_level = sec.level; break; case BT_DEFER_SETUP: if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { err = -EINVAL; break; } if (get_user(opt, (u32 __user *) optval)) { err = -EFAULT; break; } if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); else clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int rfcomm_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct sock *l2cap_sk; struct l2cap_conn *conn; struct rfcomm_conninfo cinfo; int len, err = 0; u32 opt; BT_DBG("sk %p", sk); if (get_user(len, optlen)) return -EFAULT; lock_sock(sk); switch (optname) { case RFCOMM_LM: switch (rfcomm_pi(sk)->sec_level) { case BT_SECURITY_LOW: opt = RFCOMM_LM_AUTH; break; case BT_SECURITY_MEDIUM: opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT; break; case BT_SECURITY_HIGH: opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE; break; case BT_SECURITY_FIPS: opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE | RFCOMM_LM_FIPS; break; default: opt = 0; break; } if (rfcomm_pi(sk)->role_switch) opt |= RFCOMM_LM_MASTER; if (put_user(opt, (u32 __user *) optval)) err = -EFAULT; break; case RFCOMM_CONNINFO: if (sk->sk_state != BT_CONNECTED && !rfcomm_pi(sk)->dlc->defer_setup) { err = -ENOTCONN; break; } l2cap_sk = rfcomm_pi(sk)->dlc->session->sock->sk; conn = l2cap_pi(l2cap_sk)->chan->conn; memset(&cinfo, 0, sizeof(cinfo)); cinfo.hci_handle = conn->hcon->handle; memcpy(cinfo.dev_class, conn->hcon->dev_class, 3); len = min_t(unsigned int, len, sizeof(cinfo)); if (copy_to_user(optval, (char *) &cinfo, len)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct bt_security sec; int len, err = 0; BT_DBG("sk %p", sk); if (level == SOL_RFCOMM) return rfcomm_sock_getsockopt_old(sock, optname, optval, optlen); if (level != SOL_BLUETOOTH) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; lock_sock(sk); switch (optname) { case BT_SECURITY: if (sk->sk_type != SOCK_STREAM) { err = -EINVAL; break; } sec.level = rfcomm_pi(sk)->sec_level; sec.key_size = 0; len = min_t(unsigned int, len, sizeof(sec)); if (copy_to_user(optval, (char *) &sec, len)) err = -EFAULT; break; case BT_DEFER_SETUP: if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { err = -EINVAL; break; } if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags), (u32 __user *) optval)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int rfcomm_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk __maybe_unused = sock->sk; int err; BT_DBG("sk %p cmd %x arg %lx", sk, cmd, arg); err = bt_sock_ioctl(sock, cmd, arg); if (err == -ENOIOCTLCMD) { #ifdef CONFIG_BT_RFCOMM_TTY lock_sock(sk); err = rfcomm_dev_ioctl(sk, cmd, (void __user *) arg); release_sock(sk); #else err = -EOPNOTSUPP; #endif } return err; } static int rfcomm_sock_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sock %p, sk %p", sock, sk); if (!sk) return 0; 
lock_sock(sk); if (!sk->sk_shutdown) { sk->sk_shutdown = SHUTDOWN_MASK; __rfcomm_sock_close(sk); if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && !(current->flags & PF_EXITING)) err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); } release_sock(sk); return err; } static int rfcomm_sock_release(struct socket *sock) { struct sock *sk = sock->sk; int err; BT_DBG("sock %p, sk %p", sock, sk); if (!sk) return 0; err = rfcomm_sock_shutdown(sock, 2); sock_orphan(sk); rfcomm_sock_kill(sk); return err; } /* ---- RFCOMM core layer callbacks ---- * * called under rfcomm_lock() */ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc **d) { struct sock *sk, *parent; bdaddr_t src, dst; int result = 0; BT_DBG("session %p channel %d", s, channel); rfcomm_session_getaddr(s, &src, &dst); /* Check if we have socket listening on channel */ parent = rfcomm_get_sock_by_channel(BT_LISTEN, channel, &src); if (!parent) return 0; bh_lock_sock(parent); /* Check for backlog size */ if (sk_acceptq_is_full(parent)) { BT_DBG("backlog full %d", parent->sk_ack_backlog); goto done; } sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC, 0); if (!sk) goto done; bt_sock_reclassify_lock(sk, BTPROTO_RFCOMM); rfcomm_sock_init(sk, parent); bacpy(&rfcomm_pi(sk)->src, &src); bacpy(&rfcomm_pi(sk)->dst, &dst); rfcomm_pi(sk)->channel = channel; sk->sk_state = BT_CONFIG; bt_accept_enqueue(parent, sk, true); /* Accept connection and return socket DLC */ *d = rfcomm_pi(sk)->dlc; result = 1; done: bh_unlock_sock(parent); if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags)) parent->sk_state_change(parent); return result; } static int rfcomm_sock_debugfs_show(struct seq_file *f, void *p) { struct sock *sk; read_lock(&rfcomm_sk_list.lock); sk_for_each(sk, &rfcomm_sk_list.head) { seq_printf(f, "%pMR %pMR %d %d\n", &rfcomm_pi(sk)->src, &rfcomm_pi(sk)->dst, sk->sk_state, rfcomm_pi(sk)->channel); } read_unlock(&rfcomm_sk_list.lock); return 0; } static int rfcomm_sock_debugfs_open(struct inode *inode, struct file *file) { return single_open(file, rfcomm_sock_debugfs_show, inode->i_private); } static const struct file_operations rfcomm_sock_debugfs_fops = { .open = rfcomm_sock_debugfs_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; static struct dentry *rfcomm_sock_debugfs; static const struct proto_ops rfcomm_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = rfcomm_sock_release, .bind = rfcomm_sock_bind, .connect = rfcomm_sock_connect, .listen = rfcomm_sock_listen, .accept = rfcomm_sock_accept, .getname = rfcomm_sock_getname, .sendmsg = rfcomm_sock_sendmsg, .recvmsg = rfcomm_sock_recvmsg, .shutdown = rfcomm_sock_shutdown, .setsockopt = rfcomm_sock_setsockopt, .getsockopt = rfcomm_sock_getsockopt, .ioctl = rfcomm_sock_ioctl, .poll = bt_sock_poll, .socketpair = sock_no_socketpair, .mmap = sock_no_mmap }; static const struct net_proto_family rfcomm_sock_family_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .create = rfcomm_sock_create }; int __init rfcomm_init_sockets(void) { int err; BUILD_BUG_ON(sizeof(struct sockaddr_rc) > sizeof(struct sockaddr)); err = proto_register(&rfcomm_proto, 0); if (err < 0) return err; err = bt_sock_register(BTPROTO_RFCOMM, &rfcomm_sock_family_ops); if (err < 0) { BT_ERR("RFCOMM socket layer registration failed"); goto error; } err = bt_procfs_init(&init_net, "rfcomm", &rfcomm_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create RFCOMM proc file"); bt_sock_unregister(BTPROTO_RFCOMM); goto error; } 
BT_INFO("RFCOMM socket layer initialized"); if (IS_ERR_OR_NULL(bt_debugfs)) return 0; rfcomm_sock_debugfs = debugfs_create_file("rfcomm", 0444, bt_debugfs, NULL, &rfcomm_sock_debugfs_fops); return 0; error: proto_unregister(&rfcomm_proto); return err; } void __exit rfcomm_cleanup_sockets(void) { bt_procfs_cleanup(&init_net, "rfcomm"); debugfs_remove(rfcomm_sock_debugfs); bt_sock_unregister(BTPROTO_RFCOMM); proto_unregister(&rfcomm_proto); } |
/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2006-2010 Patrick McHardy <kaber@trash.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/types.h>
#include <linux/timer.h>
#include <linux/netfilter.h>
#include <linux/in.h>
#include <linux/icmp.h>
#include <linux/seq_file.h>
#include <net/ip.h>
#include <net/checksum.h>
#include <linux/netfilter_ipv4.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_log.h>

static const unsigned int nf_ct_icmp_timeout = 30*HZ;

static inline struct nf_icmp_net *icmp_pernet(struct net *net)
{
	return &net->ct.nf_ct_proto.icmp;
}

static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
			      struct net *net, struct nf_conntrack_tuple *tuple)
{
	const struct icmphdr *hp;
	struct icmphdr _hdr;

	hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
	if (hp == NULL)
		return false;

	tuple->dst.u.icmp.type = hp->type;
	tuple->src.u.icmp.id = hp->un.echo.id;
	tuple->dst.u.icmp.code = hp->code;

	return true;
}

/* Add 1; spaces filled with 0.
*/ static const u_int8_t invmap[] = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, [ICMP_ECHOREPLY] = ICMP_ECHO + 1, [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1 }; static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { if (orig->dst.u.icmp.type >= sizeof(invmap) || !invmap[orig->dst.u.icmp.type]) return false; tuple->src.u.icmp.id = orig->src.u.icmp.id; tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; tuple->dst.u.icmp.code = orig->dst.u.icmp.code; return true; } static unsigned int *icmp_get_timeouts(struct net *net) { return &icmp_pernet(net)->timeout; } /* Returns verdict for packet, or -1 for invalid. */ static int icmp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo) { /* Do not immediately delete the connection after the first successful reply to avoid excessive conntrackd traffic and also to handle correctly ICMP echo reply duplicates. */ unsigned int *timeout = nf_ct_timeout_lookup(ct); if (!timeout) timeout = icmp_get_timeouts(nf_ct_net(ct)); nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); return NF_ACCEPT; } /* Called when a new connection for this protocol found. */ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff) { static const u_int8_t valid_new[] = { [ICMP_ECHO] = 1, [ICMP_TIMESTAMP] = 1, [ICMP_INFO_REQUEST] = 1, [ICMP_ADDRESS] = 1 }; if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { /* Can't create a new ICMP `conn' with this. */ pr_debug("icmp: can't create new conn with type %u\n", ct->tuplehash[0].tuple.dst.u.icmp.type); nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple); return false; } return true; } /* Returns conntrack if it dealt with ICMP, and filled in skb fields */ static int icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int hooknum) { struct nf_conntrack_tuple innertuple, origtuple; const struct nf_conntrack_l4proto *innerproto; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_zone *zone; enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; WARN_ON(skb_nfct(skb)); zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); /* Are they talking about one of our connections? */ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb) + ip_hdrlen(skb) + sizeof(struct icmphdr), PF_INET, net, &origtuple)) { pr_debug("icmp_error_message: failed to get tuple\n"); return -NF_ACCEPT; } /* rcu_read_lock()ed by nf_hook_thresh */ innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum); /* Ordinarily, we'd expect the inverted tupleproto, but it's been preserved inside the ICMP. 
*/ if (!nf_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; } ctinfo = IP_CT_RELATED; h = nf_conntrack_find_get(net, zone, &innertuple); if (!h) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; } if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) ctinfo += IP_CT_IS_REPLY; /* Update skb to refer to this connection */ nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); return NF_ACCEPT; } static void icmp_error_log(const struct sk_buff *skb, struct net *net, u8 pf, const char *msg) { nf_l4proto_log_invalid(skb, net, pf, IPPROTO_ICMP, "%s", msg); } /* Small and modified version of icmp_rcv */ static int icmp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u8 pf, unsigned int hooknum) { const struct icmphdr *icmph; struct icmphdr _ih; /* Not enough header? */ icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); if (icmph == NULL) { icmp_error_log(skb, net, pf, "short packet"); return -NF_ACCEPT; } /* See ip_conntrack_proto_tcp.c */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_ip_checksum(skb, hooknum, dataoff, 0)) { icmp_error_log(skb, net, pf, "bad hw icmp checksum"); return -NF_ACCEPT; } /* * 18 is the highest 'known' ICMP type. Anything else is a mystery * * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently * discarded. */ if (icmph->type > NR_ICMP_TYPES) { icmp_error_log(skb, net, pf, "invalid icmp type"); return -NF_ACCEPT; } /* Need to track icmp error message? */ if (icmph->type != ICMP_DEST_UNREACH && icmph->type != ICMP_SOURCE_QUENCH && icmph->type != ICMP_TIME_EXCEEDED && icmph->type != ICMP_PARAMETERPROB && icmph->type != ICMP_REDIRECT) return NF_ACCEPT; return icmp_error_message(net, tmpl, skb, hooknum); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static int icmp_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *t) { if (nla_put_be16(skb, CTA_PROTO_ICMP_ID, t->src.u.icmp.id) || nla_put_u8(skb, CTA_PROTO_ICMP_TYPE, t->dst.u.icmp.type) || nla_put_u8(skb, CTA_PROTO_ICMP_CODE, t->dst.u.icmp.code)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { [CTA_PROTO_ICMP_TYPE] = { .type = NLA_U8 }, [CTA_PROTO_ICMP_CODE] = { .type = NLA_U8 }, [CTA_PROTO_ICMP_ID] = { .type = NLA_U16 }, }; static int icmp_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *tuple) { if (!tb[CTA_PROTO_ICMP_TYPE] || !tb[CTA_PROTO_ICMP_CODE] || !tb[CTA_PROTO_ICMP_ID]) return -EINVAL; tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); if (tuple->dst.u.icmp.type >= sizeof(invmap) || !invmap[tuple->dst.u.icmp.type]) return -EINVAL; return 0; } static unsigned int icmp_nlattr_tuple_size(void) { static unsigned int size __read_mostly; if (!size) size = nla_policy_len(icmp_nla_policy, CTA_PROTO_MAX + 1); return size; } #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeout = data; struct nf_icmp_net *in = icmp_pernet(net); if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) { if (!timeout) timeout = &in->timeout; *timeout = 
ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ; } else if (timeout) { /* Set default ICMP timeout. */ *timeout = in->timeout; } return 0; } static int icmp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeout = data; if (nla_put_be32(skb, CTA_TIMEOUT_ICMP_TIMEOUT, htonl(*timeout / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy icmp_timeout_nla_policy[CTA_TIMEOUT_ICMP_MAX+1] = { [CTA_TIMEOUT_ICMP_TIMEOUT] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_SYSCTL static struct ctl_table icmp_sysctl_table[] = { { .procname = "nf_conntrack_icmp_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { } }; #endif /* CONFIG_SYSCTL */ static int icmp_kmemdup_sysctl_table(struct nf_proto_net *pn, struct nf_icmp_net *in) { #ifdef CONFIG_SYSCTL pn->ctl_table = kmemdup(icmp_sysctl_table, sizeof(icmp_sysctl_table), GFP_KERNEL); if (!pn->ctl_table) return -ENOMEM; pn->ctl_table[0].data = &in->timeout; #endif return 0; } static int icmp_init_net(struct net *net, u_int16_t proto) { struct nf_icmp_net *in = icmp_pernet(net); struct nf_proto_net *pn = &in->pn; in->timeout = nf_ct_icmp_timeout; return icmp_kmemdup_sysctl_table(pn, in); } static struct nf_proto_net *icmp_get_net_proto(struct net *net) { return &net->ct.nf_ct_proto.icmp.pn; } const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = { .l3proto = PF_INET, .l4proto = IPPROTO_ICMP, .pkt_to_tuple = icmp_pkt_to_tuple, .invert_tuple = icmp_invert_tuple, .packet = icmp_packet, .new = icmp_new, .error = icmp_error, .destroy = NULL, .me = NULL, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmp_tuple_to_nlattr, .nlattr_tuple_size = icmp_nlattr_tuple_size, .nlattr_to_tuple = icmp_nlattr_to_tuple, .nla_policy = icmp_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = icmp_timeout_nlattr_to_obj, .obj_to_nlattr = icmp_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_ICMP_MAX, .obj_size = sizeof(unsigned int), .nla_policy = icmp_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ .init_net = icmp_init_net, .get_net_proto = icmp_get_net_proto, }; |
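/*
 * Editor's illustrative sketch, not kernel code: the invmap[] and valid_new[]
 * tables above store "type + 1" so that an empty slot reads as 0, i.e. "no
 * mapping". A minimal userspace demo of the same idiom; the DEMO_* constants
 * are the standard ICMP echo/timestamp type values, hard-coded here only for
 * illustration rather than taken from a kernel header.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_ECHOREPLY       0
#define DEMO_ECHO            8
#define DEMO_TIMESTAMP      13
#define DEMO_TIMESTAMPREPLY 14

static const uint8_t demo_invmap[] = {
	[DEMO_ECHO]           = DEMO_ECHOREPLY + 1,
	[DEMO_ECHOREPLY]      = DEMO_ECHO + 1,
	[DEMO_TIMESTAMP]      = DEMO_TIMESTAMPREPLY + 1,
	[DEMO_TIMESTAMPREPLY] = DEMO_TIMESTAMP + 1,
};

/* Mirrors icmp_invert_tuple(): fail when the type has no known inverse. */
static bool demo_invert(uint8_t type, uint8_t *inv)
{
	if (type >= sizeof(demo_invmap) || !demo_invmap[type])
		return false;
	*inv = demo_invmap[type] - 1;
	return true;
}

int main(void)
{
	uint8_t inv;

	if (demo_invert(DEMO_ECHO, &inv))
		printf("echo (%d) inverts to %u\n", DEMO_ECHO, inv);
	if (!demo_invert(3, &inv)) /* e.g. destination unreachable */
		printf("type 3 has no inverse, as expected\n");
	return 0;
}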
/*
 * kexec.c - kexec system call core code.
 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2. See the file COPYING for more details.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/utsname.h>
#include <linux/numa.h>
#include <linux/suspend.h>
#include <linux/device.h>
#include <linux/freezer.h>
#include <linux/pm.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/console.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/syscore_ops.h>
#include <linux/compiler.h>
#include <linux/hugetlb.h>
#include <linux/frame.h>

#include <asm/page.h>
#include <asm/sections.h>

#include <crypto/hash.h>
#include <crypto/sha.h>
#include "kexec_internal.h"

DEFINE_MUTEX(kexec_mutex);

/* Per cpu memory for storing cpu states in case of system crash.
*/ note_buf_t __percpu *crash_notes; /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { .name = "Crash kernel", .start = 0, .end = 0, .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, .desc = IORES_DESC_CRASH_KERNEL }; struct resource crashk_low_res = { .name = "Crash kernel", .start = 0, .end = 0, .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, .desc = IORES_DESC_CRASH_KERNEL }; int kexec_should_crash(struct task_struct *p) { /* * If crash_kexec_post_notifiers is enabled, don't run * crash_kexec() here yet, which must be run after panic * notifiers in panic(). */ if (crash_kexec_post_notifiers) return 0; /* * There are 4 panic() calls in do_exit() path, each of which * corresponds to each of these 4 conditions. */ if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) return 1; return 0; } int kexec_crash_loaded(void) { return !!kexec_crash_image; } EXPORT_SYMBOL_GPL(kexec_crash_loaded); /* * When kexec transitions to the new kernel there is a one-to-one * mapping between physical and virtual addresses. On processors * where you can disable the MMU this is trivial, and easy. For * others it is still a simple predictable page table to setup. * * In that environment kexec copies the new kernel to its final * resting place. This means I can only support memory whose * physical address can fit in an unsigned long. In particular * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. * If the assembly stub has more restrictive requirements * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be * defined more restrictively in <asm/kexec.h>. * * The code for the transition from the current kernel to the * the new kernel is placed in the control_code_buffer, whose size * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single * page of memory is necessary, but some architectures require more. * Because this memory must be identity mapped in the transition from * virtual to physical addresses it must live in the range * 0 - TASK_SIZE, as only the user space mappings are arbitrarily * modifiable. * * The assembly stub in the control code buffer is passed a linked list * of descriptor pages detailing the source pages of the new kernel, * and the destination addresses of those source pages. As this data * structure is not used in the context of the current OS, it must * be self-contained. * * The code has been made to work with highmem pages and will use a * destination page in its final resting place (if it happens * to allocate it). The end product of this is that most of the * physical address space, and most of RAM can be used. * * Future directions include: * - allocating a page table with the control code buffer identity * mapped, to simplify machine_kexec and make kexec_on_panic more * reliable. */ /* * KIMAGE_NO_DEST is an impossible destination address..., for * allocating pages whose destination address we do not care about. */ #define KIMAGE_NO_DEST (-1UL) #define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT) static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long dest); int sanity_check_segment_list(struct kimage *image) { int i; unsigned long nr_segments = image->nr_segments; unsigned long total_pages = 0; /* * Verify we have good destination addresses. 
The caller is * responsible for making certain we don't attempt to load * the new image into invalid or reserved areas of RAM. This * just verifies it is an address we can use. * * Since the kernel does everything in page size chunks ensure * the destination addresses are page aligned. Too many * special cases crop of when we don't do this. The most * insidious is getting overlapping destination addresses * simply because addresses are changed to page size * granularity. */ for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if (mstart > mend) return -EADDRNOTAVAIL; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) return -EADDRNOTAVAIL; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) return -EADDRNOTAVAIL; } /* Verify our destination addresses do not overlap. * If we alloed overlapping destination addresses * through very weird things can happen with no * easy explanation as one segment stops on another. */ for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; unsigned long j; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; for (j = 0; j < i; j++) { unsigned long pstart, pend; pstart = image->segment[j].mem; pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) return -EINVAL; } } /* Ensure our buffer sizes are strictly less than * our memory sizes. This should always be the case, * and it is easier to check up front than to be surprised * later on. */ for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) return -EINVAL; } /* * Verify that no more than half of memory will be consumed. If the * request from userspace is too large, a large amount of time will be * wasted allocating pages, which can cause a soft lockup. */ for (i = 0; i < nr_segments; i++) { if (PAGE_COUNT(image->segment[i].memsz) > totalram_pages / 2) return -EINVAL; total_pages += PAGE_COUNT(image->segment[i].memsz); } if (total_pages > totalram_pages / 2) return -EINVAL; /* * Verify we have good destination addresses. Normally * the caller is responsible for making certain we don't * attempt to load the new image into invalid or reserved * areas of RAM. But crash kernels are preloaded into a * reserved area of ram. We must ensure the addresses * are in the reserved area otherwise preloading the * kernel could corrupt things. 
*/ if (image->type == KEXEC_TYPE_CRASH) { for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; /* Ensure we are within the crash kernel limits */ if ((mstart < phys_to_boot_phys(crashk_res.start)) || (mend > phys_to_boot_phys(crashk_res.end))) return -EADDRNOTAVAIL; } } return 0; } struct kimage *do_kimage_alloc_init(void) { struct kimage *image; /* Allocate a controlling structure */ image = kzalloc(sizeof(*image), GFP_KERNEL); if (!image) return NULL; image->head = 0; image->entry = &image->head; image->last_entry = &image->head; image->control_page = ~0; /* By default this does not apply */ image->type = KEXEC_TYPE_DEFAULT; /* Initialize the list of control pages */ INIT_LIST_HEAD(&image->control_pages); /* Initialize the list of destination pages */ INIT_LIST_HEAD(&image->dest_pages); /* Initialize the list of unusable pages */ INIT_LIST_HEAD(&image->unusable_pages); return image; } int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end) { unsigned long i; for (i = 0; i < image->nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if ((end > mstart) && (start < mend)) return 1; } return 0; } static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; if (fatal_signal_pending(current)) return NULL; pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order); if (pages) { unsigned int count, i; pages->mapping = NULL; set_page_private(pages, order); count = 1 << order; for (i = 0; i < count; i++) SetPageReserved(pages + i); arch_kexec_post_alloc_pages(page_address(pages), count, gfp_mask); if (gfp_mask & __GFP_ZERO) for (i = 0; i < count; i++) clear_highpage(pages + i); } return pages; } static void kimage_free_pages(struct page *page) { unsigned int order, count, i; order = page_private(page); count = 1 << order; arch_kexec_pre_free_pages(page_address(page), count); for (i = 0; i < count; i++) ClearPageReserved(page + i); __free_pages(page, order); } void kimage_free_page_list(struct list_head *list) { struct page *page, *next; list_for_each_entry_safe(page, next, list, lru) { list_del(&page->lru); kimage_free_pages(page); } } static struct page *kimage_alloc_normal_control_pages(struct kimage *image, unsigned int order) { /* Control pages are special, they are the intermediaries * that are needed while we copy the rest of the pages * to their final resting place. As such they must * not conflict with either the destination addresses * or memory the kernel is already using. * * The only case where we really need more than one of * these are for architectures where we cannot disable * the MMU and must instead generate an identity mapped * page table for all of the memory. * * At worst this runs in O(N) of the image size. */ struct list_head extra_pages; struct page *pages; unsigned int count; count = 1 << order; INIT_LIST_HEAD(&extra_pages); /* Loop while I can allocate a page and the page allocated * is a destination page. 
*/ do { unsigned long pfn, epfn, addr, eaddr; pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); if (!pages) break; pfn = page_to_boot_pfn(pages); epfn = pfn + count; addr = pfn << PAGE_SHIFT; eaddr = epfn << PAGE_SHIFT; if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || kimage_is_destination_range(image, addr, eaddr)) { list_add(&pages->lru, &extra_pages); pages = NULL; } } while (!pages); if (pages) { /* Remember the allocated page... */ list_add(&pages->lru, &image->control_pages); /* Because the page is already in it's destination * location we will never allocate another page at * that address. Therefore kimage_alloc_pages * will not return it (again) and we don't need * to give it an entry in image->segment[]. */ } /* Deal with the destination pages I have inadvertently allocated. * * Ideally I would convert multi-page allocations into single * page allocations, and add everything to image->dest_pages. * * For now it is simpler to just free the pages. */ kimage_free_page_list(&extra_pages); return pages; } static struct page *kimage_alloc_crash_control_pages(struct kimage *image, unsigned int order) { /* Control pages are special, they are the intermediaries * that are needed while we copy the rest of the pages * to their final resting place. As such they must * not conflict with either the destination addresses * or memory the kernel is already using. * * Control pages are also the only pags we must allocate * when loading a crash kernel. All of the other pages * are specified by the segments and we just memcpy * into them directly. * * The only case where we really need more than one of * these are for architectures where we cannot disable * the MMU and must instead generate an identity mapped * page table for all of the memory. * * Given the low demand this implements a very simple * allocator that finds the first hole of the appropriate * size in the reserved memory region, and allocates all * of the memory up to and including the hole. */ unsigned long hole_start, hole_end, size; struct page *pages; pages = NULL; size = (1 << order) << PAGE_SHIFT; hole_start = (image->control_page + (size - 1)) & ~(size - 1); hole_end = hole_start + size - 1; while (hole_end <= crashk_res.end) { unsigned long i; cond_resched(); if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) break; /* See if I overlap any of the segments */ for (i = 0; i < image->nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; if ((hole_end >= mstart) && (hole_start <= mend)) { /* Advance the hole to the end of the segment */ hole_start = (mend + (size - 1)) & ~(size - 1); hole_end = hole_start + size - 1; break; } } /* If I don't overlap any segments I have found my hole! */ if (i == image->nr_segments) { pages = pfn_to_page(hole_start >> PAGE_SHIFT); image->control_page = hole_end; break; } } /* Ensure that these pages are decrypted if SME is enabled. 
*/ if (pages) arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0); return pages; } struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order) { struct page *pages = NULL; switch (image->type) { case KEXEC_TYPE_DEFAULT: pages = kimage_alloc_normal_control_pages(image, order); break; case KEXEC_TYPE_CRASH: pages = kimage_alloc_crash_control_pages(image, order); break; } return pages; } int kimage_crash_copy_vmcoreinfo(struct kimage *image) { struct page *vmcoreinfo_page; void *safecopy; if (image->type != KEXEC_TYPE_CRASH) return 0; /* * For kdump, allocate one vmcoreinfo safe copy from the * crash memory. as we have arch_kexec_protect_crashkres() * after kexec syscall, we naturally protect it from write * (even read) access under kernel direct mapping. But on * the other hand, we still need to operate it when crash * happens to generate vmcoreinfo note, hereby we rely on * vmap for this purpose. */ vmcoreinfo_page = kimage_alloc_control_pages(image, 0); if (!vmcoreinfo_page) { pr_warn("Could not allocate vmcoreinfo buffer\n"); return -ENOMEM; } safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); if (!safecopy) { pr_warn("Could not vmap vmcoreinfo buffer\n"); return -ENOMEM; } image->vmcoreinfo_data_copy = safecopy; crash_update_vmcoreinfo_safecopy(safecopy); return 0; } static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { if (*image->entry != 0) image->entry++; if (image->entry == image->last_entry) { kimage_entry_t *ind_page; struct page *page; page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); if (!page) return -ENOMEM; ind_page = page_address(page); *image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION; image->entry = ind_page; image->last_entry = ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); } *image->entry = entry; image->entry++; *image->entry = 0; return 0; } static int kimage_set_destination(struct kimage *image, unsigned long destination) { int result; destination &= PAGE_MASK; result = kimage_add_entry(image, destination | IND_DESTINATION); return result; } static int kimage_add_page(struct kimage *image, unsigned long page) { int result; page &= PAGE_MASK; result = kimage_add_entry(image, page | IND_SOURCE); return result; } static void kimage_free_extra_pages(struct kimage *image) { /* Walk through and free any extra destination pages I may have */ kimage_free_page_list(&image->dest_pages); /* Walk through and free any unusable pages I have cached */ kimage_free_page_list(&image->unusable_pages); } void kimage_terminate(struct kimage *image) { if (*image->entry != 0) image->entry++; *image->entry = IND_DONE; } #define for_each_kimage_entry(image, ptr, entry) \ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ ptr = (entry & IND_INDIRECTION) ? \ boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1) static void kimage_free_entry(kimage_entry_t entry) { struct page *page; page = boot_pfn_to_page(entry >> PAGE_SHIFT); kimage_free_pages(page); } void kimage_free(struct kimage *image) { kimage_entry_t *ptr, entry; kimage_entry_t ind = 0; if (!image) return; if (image->vmcoreinfo_data_copy) { crash_update_vmcoreinfo_safecopy(NULL); vunmap(image->vmcoreinfo_data_copy); } kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { if (entry & IND_INDIRECTION) { /* Free the previous indirection page */ if (ind & IND_INDIRECTION) kimage_free_entry(ind); /* Save this indirection page until we are * done with it. 
*/ ind = entry; } else if (entry & IND_SOURCE) kimage_free_entry(entry); } /* Free the final indirection page */ if (ind & IND_INDIRECTION) kimage_free_entry(ind); /* Handle any machine specific cleanup */ machine_kexec_cleanup(image); /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); /* * Free up any temporary buffers allocated. This might hit if * error occurred much later after buffer allocation. */ if (image->file_mode) kimage_file_post_load_cleanup(image); kfree(image); } static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page) { kimage_entry_t *ptr, entry; unsigned long destination = 0; for_each_kimage_entry(image, ptr, entry) { if (entry & IND_DESTINATION) destination = entry & PAGE_MASK; else if (entry & IND_SOURCE) { if (page == destination) return ptr; destination += PAGE_SIZE; } } return NULL; } static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long destination) { /* * Here we implement safeguards to ensure that a source page * is not copied to its destination page before the data on * the destination page is no longer useful. * * To do this we maintain the invariant that a source page is * either its own destination page, or it is not a * destination page at all. * * That is slightly stronger than required, but the proof * that no problems will not occur is trivial, and the * implementation is simply to verify. * * When allocating all pages normally this algorithm will run * in O(N) time, but in the worst case it will run in O(N^2) * time. If the runtime is a problem the data structures can * be fixed. */ struct page *page; unsigned long addr; /* * Walk through the list of destination pages, and see if I * have a match. */ list_for_each_entry(page, &image->dest_pages, lru) { addr = page_to_boot_pfn(page) << PAGE_SHIFT; if (addr == destination) { list_del(&page->lru); return page; } } page = NULL; while (1) { kimage_entry_t *old; /* Allocate a page, if we run out of memory give up */ page = kimage_alloc_pages(gfp_mask, 0); if (!page) return NULL; /* If the page cannot be used file it away */ if (page_to_boot_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { list_add(&page->lru, &image->unusable_pages); continue; } addr = page_to_boot_pfn(page) << PAGE_SHIFT; /* If it is the destination page we want use it */ if (addr == destination) break; /* If the page is not a destination page use it */ if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE)) break; /* * I know that the page is someones destination page. * See if there is already a source page for this * destination page. And if so swap the source pages. */ old = kimage_dst_used(image, addr); if (old) { /* If so move it */ unsigned long old_addr; struct page *old_page; old_addr = *old & PAGE_MASK; old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT); copy_highpage(page, old_page); *old = addr | (*old & ~PAGE_MASK); /* The old page I have found cannot be a * destination page, so return it if it's * gfp_flags honor the ones passed in. 
*/ if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(old_page)) { kimage_free_pages(old_page); continue; } addr = old_addr; page = old_page; break; } /* Place the page on the destination list, to be used later */ list_add(&page->lru, &image->dest_pages); } return page; } static int kimage_load_normal_segment(struct kimage *image, struct kexec_segment *segment) { unsigned long maddr; size_t ubytes, mbytes; int result; unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; result = 0; if (image->file_mode) kbuf = segment->kbuf; else buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; result = kimage_set_destination(image, maddr); if (result < 0) goto out; while (mbytes) { struct page *page; char *ptr; size_t uchunk, mchunk; page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); if (!page) { result = -ENOMEM; goto out; } result = kimage_add_page(image, page_to_boot_pfn(page) << PAGE_SHIFT); if (result < 0) goto out; ptr = kmap(page); /* Start with a clear page */ clear_page(ptr); ptr += maddr & ~PAGE_MASK; mchunk = min_t(size_t, mbytes, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); /* For file based kexec, source pages are in kernel memory */ if (image->file_mode) memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); kunmap(page); if (result) { result = -EFAULT; goto out; } ubytes -= uchunk; maddr += mchunk; if (image->file_mode) kbuf += mchunk; else buf += mchunk; mbytes -= mchunk; cond_resched(); } out: return result; } static int kimage_load_crash_segment(struct kimage *image, struct kexec_segment *segment) { /* For crash dumps kernels we simply copy the data from * user space to it's destination. * We do things a page at a time for the sake of kmap. */ unsigned long maddr; size_t ubytes, mbytes; int result; unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; result = 0; if (image->file_mode) kbuf = segment->kbuf; else buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; while (mbytes) { struct page *page; char *ptr; size_t uchunk, mchunk; page = boot_pfn_to_page(maddr >> PAGE_SHIFT); if (!page) { result = -ENOMEM; goto out; } arch_kexec_post_alloc_pages(page_address(page), 1, 0); ptr = kmap(page); ptr += maddr & ~PAGE_MASK; mchunk = min_t(size_t, mbytes, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); if (mchunk > uchunk) { /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } /* For file based kexec, source pages are in kernel memory */ if (image->file_mode) memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); kunmap(page); arch_kexec_pre_free_pages(page_address(page), 1); if (result) { result = -EFAULT; goto out; } ubytes -= uchunk; maddr += mchunk; if (image->file_mode) kbuf += mchunk; else buf += mchunk; mbytes -= mchunk; cond_resched(); } out: return result; } int kimage_load_segment(struct kimage *image, struct kexec_segment *segment) { int result = -ENOMEM; switch (image->type) { case KEXEC_TYPE_DEFAULT: result = kimage_load_normal_segment(image, segment); break; case KEXEC_TYPE_CRASH: result = kimage_load_crash_segment(image, segment); break; } return result; } struct kimage *kexec_image; struct kimage *kexec_crash_image; int kexec_load_disabled; /* * No panic_cpu check version of crash_kexec(). This function is called * only when panic_cpu holds the current CPU number; this is the only CPU * which processes crash_kexec routines. 
*/ void __noclone __crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel * we are using after a panic on a different cpu. * * If the crash kernel was not located in a fixed area * of memory the xchg(&kexec_crash_image) would be * sufficient. But since I reuse the memory... */ if (mutex_trylock(&kexec_mutex)) { if (kexec_crash_image) { struct pt_regs fixed_regs; crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); machine_kexec(kexec_crash_image); } mutex_unlock(&kexec_mutex); } } STACK_FRAME_NON_STANDARD(__crash_kexec); void crash_kexec(struct pt_regs *regs) { int old_cpu, this_cpu; /* * Only one CPU is allowed to execute the crash_kexec() code as with * panic(). Otherwise parallel calls of panic() and crash_kexec() * may stop each other. To exclude them, we use panic_cpu here too. */ this_cpu = raw_smp_processor_id(); old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); if (old_cpu == PANIC_CPU_INVALID) { /* This is the 1st CPU which comes here, so go ahead. */ printk_safe_flush_on_panic(); __crash_kexec(regs); /* * Reset panic_cpu to allow another panic()/crash_kexec() * call. */ atomic_set(&panic_cpu, PANIC_CPU_INVALID); } } size_t crash_get_memory_size(void) { size_t size = 0; mutex_lock(&kexec_mutex); if (crashk_res.end != crashk_res.start) size = resource_size(&crashk_res); mutex_unlock(&kexec_mutex); return size; } void __weak crash_free_reserved_phys_range(unsigned long begin, unsigned long end) { unsigned long addr; for (addr = begin; addr < end; addr += PAGE_SIZE) free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT)); } int crash_shrink_memory(unsigned long new_size) { int ret = 0; unsigned long start, end; unsigned long old_size; struct resource *ram_res; mutex_lock(&kexec_mutex); if (kexec_crash_image) { ret = -ENOENT; goto unlock; } start = crashk_res.start; end = crashk_res.end; old_size = (end == 0) ? 0 : end - start + 1; if (new_size >= old_size) { ret = (new_size == old_size) ? 0 : -EINVAL; goto unlock; } ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); if (!ram_res) { ret = -ENOMEM; goto unlock; } start = roundup(start, KEXEC_CRASH_MEM_ALIGN); end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); crash_free_reserved_phys_range(end, crashk_res.end); if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); ram_res->start = end; ram_res->end = crashk_res.end; ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; ram_res->name = "System RAM"; crashk_res.end = end - 1; insert_resource(&iomem_resource, ram_res); unlock: mutex_unlock(&kexec_mutex); return ret; } void crash_save_cpu(struct pt_regs *regs, int cpu) { struct elf_prstatus prstatus; u32 *buf; if ((cpu < 0) || (cpu >= nr_cpu_ids)) return; /* Using ELF notes here is opportunistic. * I need a well defined structure format * for the data I pass, and I need tags * on the data to indicate what information I have * squirrelled away. ELF notes happen to provide * all of that, so there is no need to invent something new. */ buf = (u32 *)per_cpu_ptr(crash_notes, cpu); if (!buf) return; memset(&prstatus, 0, sizeof(prstatus)); prstatus.pr_pid = current->pid; elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, &prstatus, sizeof(prstatus)); final_note(buf); } static int __init crash_notes_memory_init(void) { /* Allocate memory for saving cpu registers. 
*/ size_t size, align; /* * crash_notes could be allocated across 2 vmalloc pages when percpu * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc * pages are also on 2 continuous physical pages. In this case the * 2nd part of crash_notes in 2nd page could be lost since only the * starting address and size of crash_notes are exported through sysfs. * Here round up the size of crash_notes to the nearest power of two * and pass it to __alloc_percpu as align value. This can make sure * crash_notes is allocated inside one physical page. */ size = sizeof(note_buf_t); align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE); /* * Break compile if size is bigger than PAGE_SIZE since crash_notes * definitely will be in 2 pages with that. */ BUILD_BUG_ON(size > PAGE_SIZE); crash_notes = __alloc_percpu(size, align); if (!crash_notes) { pr_warn("Memory allocation for saving cpu register states failed\n"); return -ENOMEM; } return 0; } subsys_initcall(crash_notes_memory_init); /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. */ int kernel_kexec(void) { int error = 0; if (!mutex_trylock(&kexec_mutex)) return -EBUSY; if (!kexec_image) { error = -EINVAL; goto Unlock; } #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { pm_prepare_console(); error = freeze_processes(); if (error) { error = -EBUSY; goto Restore_console; } suspend_console(); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Resume_console; /* At this point, dpm_suspend_start() has been called, * but *not* dpm_suspend_end(). We *must* call * dpm_suspend_end() now. Otherwise, drivers for * some devices (e.g. interrupt controllers) become * desynchronized with the actual state of the * hardware at resume time, and evil weirdness ensues. */ error = dpm_suspend_end(PMSG_FREEZE); if (error) goto Resume_devices; error = disable_nonboot_cpus(); if (error) goto Enable_cpus; local_irq_disable(); error = syscore_suspend(); if (error) goto Enable_irqs; } else #endif { kexec_in_progress = true; kernel_restart_prepare(NULL); migrate_to_reboot_cpu(); /* * migrate_to_reboot_cpu() disables CPU hotplug assuming that * no further code needs to use CPU hotplug (which is true in * the reboot case). However, the kexec path depends on using * CPU hotplug again; so re-enable it here. */ cpu_hotplug_enable(); pr_emerg("Starting new kernel\n"); machine_shutdown(); } machine_kexec(kexec_image); #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { syscore_resume(); Enable_irqs: local_irq_enable(); Enable_cpus: enable_nonboot_cpus(); dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); Resume_console: resume_console(); thaw_processes(); Restore_console: pm_restore_console(); } #endif Unlock: mutex_unlock(&kexec_mutex); return error; } /* * Protection mechanism for crashkernel reserved memory after * the kdump kernel is loaded. * * Provide an empty default implementation here -- architecture * code may override this */ void __weak arch_kexec_protect_crashkres(void) {} void __weak arch_kexec_unprotect_crashkres(void) {} |
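/*
 * Editor's illustrative sketch, not kernel code: kimage_add_entry(),
 * kimage_terminate() and for_each_kimage_entry() above work on a flat list of
 * entries that pack a page-aligned physical address in the high bits and an
 * IND_* flag in the low bits. The userspace demo below reproduces that
 * encoding and the destination-advance behaviour seen in kimage_dst_used();
 * the DEMO_IND_* flag values are made up for this example (the real IND_*
 * definitions live in the kernel's kexec headers), and indirection entries
 * (chaining to a further page of entries) are omitted for brevity.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGE_SIZE   4096UL
#define DEMO_PAGE_MASK   (~(DEMO_PAGE_SIZE - 1))

#define DEMO_IND_DESTINATION 0x1UL	/* "start copying to this address" */
#define DEMO_IND_SOURCE      0x2UL	/* "this page holds the next chunk" */
#define DEMO_IND_DONE        0x4UL	/* terminator, like kimage_terminate() */

typedef unsigned long demo_entry_t;

static void demo_walk(const demo_entry_t *list)
{
	unsigned long dest = 0;

	for (; !(*list & DEMO_IND_DONE); list++) {
		if (*list & DEMO_IND_DESTINATION) {
			dest = *list & DEMO_PAGE_MASK;
		} else if (*list & DEMO_IND_SOURCE) {
			printf("copy page at %#lx -> %#lx\n",
			       *list & DEMO_PAGE_MASK, dest);
			dest += DEMO_PAGE_SIZE;	/* next source lands one page on */
		}
	}
}

int main(void)
{
	/* "Place two source pages starting at destination 0x100000." */
	demo_entry_t list[] = {
		0x100000UL | DEMO_IND_DESTINATION,
		0x7f0000UL | DEMO_IND_SOURCE,
		0x7f3000UL | DEMO_IND_SOURCE,
		DEMO_IND_DONE,
	};

	demo_walk(list);
	return 0;
}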
/* Copyright (C) 2010: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
 * Copyright (C) 2015: Linus Lüssing <linus.luessing@c0d3.blue>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 *
 * Based on the MLD support added to br_multicast.c by YOSHIFUJI Hideaki.
 */

#include <linux/skbuff.h>
#include <net/ipv6.h>
#include <net/mld.h>
#include <net/addrconf.h>
#include <net/ip6_checksum.h>

static int ipv6_mc_check_ip6hdr(struct sk_buff *skb)
{
	const struct ipv6hdr *ip6h;
	unsigned int len;
	unsigned int offset = skb_network_offset(skb) + sizeof(*ip6h);

	if (!pskb_may_pull(skb, offset))
		return -EINVAL;

	ip6h = ipv6_hdr(skb);

	if (ip6h->version != 6)
		return -EINVAL;

	len = offset + ntohs(ip6h->payload_len);
	if (skb->len < len || len <= offset)
		return -EINVAL;

	return 0;
}

static int ipv6_mc_check_exthdrs(struct sk_buff *skb)
{
	const struct ipv6hdr *ip6h;
	int offset;
	u8 nexthdr;
	__be16 frag_off;

	ip6h = ipv6_hdr(skb);

	if (ip6h->nexthdr != IPPROTO_HOPOPTS)
		return -ENOMSG;

	nexthdr = ip6h->nexthdr;
	offset = skb_network_offset(skb) + sizeof(*ip6h);
	offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off);

	if (offset < 0)
		return -EINVAL;

	if (nexthdr != IPPROTO_ICMPV6)
		return -ENOMSG;

	skb_set_transport_header(skb, offset);

	return 0;
}

static int ipv6_mc_check_mld_reportv2(struct sk_buff *skb)
{
	unsigned int len = skb_transport_offset(skb);

	len += sizeof(struct mld2_report);

	return pskb_may_pull(skb, len) ? 0 : -EINVAL;
}

static int ipv6_mc_check_mld_query(struct sk_buff *skb)
{
	struct mld_msg *mld;
	unsigned int len = skb_transport_offset(skb);

	/* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */
	if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL))
		return -EINVAL;

	len += sizeof(struct mld_msg);
	if (skb->len < len)
		return -EINVAL;

	/* MLDv1? */
	if (skb->len != len) {
		/* or MLDv2?
*/ len += sizeof(struct mld2_query) - sizeof(struct mld_msg); if (skb->len < len || !pskb_may_pull(skb, len)) return -EINVAL; } mld = (struct mld_msg *)skb_transport_header(skb); /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer * all-nodes destination address (ff02::1) for general queries */ if (ipv6_addr_any(&mld->mld_mca) && !ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) return -EINVAL; return 0; } static int ipv6_mc_check_mld_msg(struct sk_buff *skb) { struct mld_msg *mld = (struct mld_msg *)skb_transport_header(skb); switch (mld->mld_type) { case ICMPV6_MGM_REDUCTION: case ICMPV6_MGM_REPORT: /* fall through */ return 0; case ICMPV6_MLD2_REPORT: return ipv6_mc_check_mld_reportv2(skb); case ICMPV6_MGM_QUERY: return ipv6_mc_check_mld_query(skb); default: return -ENOMSG; } } static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb) { return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo); } static int __ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed) { struct sk_buff *skb_chk = NULL; unsigned int transport_len; unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg); int ret = -EINVAL; transport_len = ntohs(ipv6_hdr(skb)->payload_len); transport_len -= skb_transport_offset(skb) - sizeof(struct ipv6hdr); skb_chk = skb_checksum_trimmed(skb, transport_len, ipv6_mc_validate_checksum); if (!skb_chk) goto err; if (!pskb_may_pull(skb_chk, len)) goto err; ret = ipv6_mc_check_mld_msg(skb_chk); if (ret) goto err; if (skb_trimmed) *skb_trimmed = skb_chk; /* free now unneeded clone */ else if (skb_chk != skb) kfree_skb(skb_chk); ret = 0; err: if (ret && skb_chk && skb_chk != skb) kfree_skb(skb_chk); return ret; } /** * ipv6_mc_check_mld - checks whether this is a sane MLD packet * @skb: the skb to validate * @skb_trimmed: to store an skb pointer trimmed to IPv6 packet tail (optional) * * Checks whether an IPv6 packet is a valid MLD packet. If so sets * skb transport header accordingly and returns zero. * * -EINVAL: A broken packet was detected, i.e. it violates some internet * standard * -ENOMSG: IP header validation succeeded but it is not an MLD packet. * -ENOMEM: A memory allocation failure happened. * * Optionally, an skb pointer might be provided via skb_trimmed (or set it * to NULL): After parsing an MLD packet successfully it will point to * an skb which has its tail aligned to the IP packet end. This might * either be the originally provided skb or a trimmed, cloned version if * the skb frame had data beyond the IP packet. A cloned skb allows us * to leave the original skb and its full frame unchanged (which might be * desirable for layer 2 frame jugglers). * * Caller needs to set the skb network header and free any returned skb if it * differs from the provided skb. */ int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed) { int ret; ret = ipv6_mc_check_ip6hdr(skb); if (ret < 0) return ret; ret = ipv6_mc_check_exthdrs(skb); if (ret < 0) return ret; return __ipv6_mc_check_mld(skb, skb_trimmed); } EXPORT_SYMBOL(ipv6_mc_check_mld); |
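/*
 * Editor's illustrative sketch, not kernel code: ipv6_mc_check_mld_query()
 * above tells MLDv1 from MLDv2 queries purely by length. Exactly
 * sizeof(struct mld_msg) bytes of ICMPv6 payload means MLDv1; anything longer
 * must reach at least sizeof(struct mld2_query). The 24 and 28 byte constants
 * below are assumed to match those structure sizes (RFC 2710 / RFC 3810
 * layouts) and are hard-coded only for this standalone demo.
 */
#include <stdio.h>

#define DEMO_MLD_MSG_LEN    24	/* assumed sizeof(struct mld_msg) */
#define DEMO_MLD2_QUERY_LEN 28	/* assumed sizeof(struct mld2_query) */

/* Returns 1 for MLDv1, 2 for MLDv2, or -1 for a malformed query length. */
static int demo_mld_query_version(unsigned int icmpv6_len)
{
	if (icmpv6_len < DEMO_MLD_MSG_LEN)
		return -1;		/* too short to be any MLD query */
	if (icmpv6_len == DEMO_MLD_MSG_LEN)
		return 1;		/* exactly an MLDv1 query */
	if (icmpv6_len < DEMO_MLD2_QUERY_LEN)
		return -1;		/* longer than v1 but shorter than v2 */
	return 2;			/* MLDv2 query (possibly with sources) */
}

int main(void)
{
	unsigned int lens[] = { 20, 24, 26, 28, 36 };

	for (unsigned int i = 0; i < sizeof(lens) / sizeof(lens[0]); i++)
		printf("len %u -> version %d\n", lens[i],
		       demo_mld_query_version(lens[i]));
	return 0;
}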
/*
 * Bridge per vlan tunnel port dst_metadata handling code
 *
 * Authors:
 * Roopa Prabhu <roopa@cumulusnetworks.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <net/switchdev.h>
#include <net/dst_metadata.h>

#include "br_private.h"
#include "br_private_tunnel.h"

static inline int br_vlan_tunid_cmp(struct rhashtable_compare_arg *arg,
				    const void *ptr)
{
	const struct net_bridge_vlan *vle = ptr;
	__be64 tunid = *(__be64 *)arg->key;

	return vle->tinfo.tunnel_id != tunid;
}

static const struct rhashtable_params br_vlan_tunnel_rht_params = {
	.head_offset = offsetof(struct net_bridge_vlan, tnode),
	.key_offset = offsetof(struct net_bridge_vlan, tinfo.tunnel_id),
	.key_len = sizeof(__be64),
	.nelem_hint = 3,
	.locks_mul = 1,
	.obj_cmpfn = br_vlan_tunid_cmp,
	.automatic_shrinking = true,
};

static struct net_bridge_vlan *br_vlan_tunnel_lookup(struct rhashtable *tbl,
						     u64 tunnel_id)
{
	return rhashtable_lookup_fast(tbl, &tunnel_id,
				      br_vlan_tunnel_rht_params);
}

static void vlan_tunnel_info_release(struct net_bridge_vlan *vlan)
{
	struct metadata_dst *tdst = rtnl_dereference(vlan->tinfo.tunnel_dst);

	WRITE_ONCE(vlan->tinfo.tunnel_id, 0);
	RCU_INIT_POINTER(vlan->tinfo.tunnel_dst, NULL);
	dst_release(&tdst->dst);
}

void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
			  struct net_bridge_vlan *vlan)
{
	if (!rcu_access_pointer(vlan->tinfo.tunnel_dst))
		return;
	rhashtable_remove_fast(&vg->tunnel_hash, &vlan->tnode,
			       br_vlan_tunnel_rht_params);
	vlan_tunnel_info_release(vlan);
}

static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg,
				  struct net_bridge_vlan *vlan, u32 tun_id)
{
	struct metadata_dst *metadata = rtnl_dereference(vlan->tinfo.tunnel_dst);
	__be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id));
	int err;

	if (metadata)
		return -EEXIST;

	metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY, key, 0);
	if (!metadata)
		return -EINVAL;

	metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_BRIDGE;
	rcu_assign_pointer(vlan->tinfo.tunnel_dst, metadata);
	WRITE_ONCE(vlan->tinfo.tunnel_id, key);

	err = rhashtable_lookup_insert_fast(&vg->tunnel_hash, &vlan->tnode,
					    br_vlan_tunnel_rht_params);
	if (err)
		goto out;

	return 0;
out:
	vlan_tunnel_info_release(vlan);
	return err;
}

/* Must be protected by RTNL.
 * Must be called with vid in range from 1 to 4094 inclusive.
*/ int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; ASSERT_RTNL(); vg = nbp_vlan_group(port); vlan = br_vlan_find(vg, vid); if (!vlan) return -EINVAL; return __vlan_tunnel_info_add(vg, vlan, tun_id); } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. */ int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; ASSERT_RTNL(); vg = nbp_vlan_group(port); v = br_vlan_find(vg, vid); if (!v) return -ENOENT; vlan_tunnel_info_del(vg, v); return 0; } static void __vlan_tunnel_info_flush(struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *vlan, *tmp; list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) vlan_tunnel_info_del(vg, vlan); } void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port) { struct net_bridge_vlan_group *vg; ASSERT_RTNL(); vg = nbp_vlan_group(port); __vlan_tunnel_info_flush(vg); } int vlan_tunnel_init(struct net_bridge_vlan_group *vg) { return rhashtable_init(&vg->tunnel_hash, &br_vlan_tunnel_rht_params); } void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg) { rhashtable_destroy(&vg->tunnel_hash); } int br_handle_ingress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_port *p, struct net_bridge_vlan_group *vg) { struct ip_tunnel_info *tinfo = skb_tunnel_info(skb); struct net_bridge_vlan *vlan; if (!vg || !tinfo) return 0; /* if already tagged, ignore */ if (skb_vlan_tagged(skb)) return 0; /* lookup vid, given tunnel id */ vlan = br_vlan_tunnel_lookup(&vg->tunnel_hash, tinfo->key.tun_id); if (!vlan) return 0; skb_dst_drop(skb); __vlan_hwaccel_put_tag(skb, p->br->vlan_proto, vlan->vid); return 0; } int br_handle_egress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_vlan *vlan) { struct metadata_dst *tunnel_dst; __be64 tunnel_id; int err; if (!vlan) return 0; tunnel_id = READ_ONCE(vlan->tinfo.tunnel_id); if (!tunnel_id || unlikely(!skb_vlan_tag_present(skb))) return 0; skb_dst_drop(skb); err = skb_vlan_pop(skb); if (err) return err; tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst); if (tunnel_dst && dst_hold_safe(&tunnel_dst->dst)) skb_dst_set(skb, &tunnel_dst->dst); return 0; } |
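/*
 * Editor's illustrative sketch, not kernel code: the per-VLAN tunnel code
 * above keeps two lookup paths for one mapping. Egress
 * (br_handle_egress_vlan_tunnel) reads the tunnel id straight off the VLAN
 * entry, while ingress (br_handle_ingress_vlan_tunnel) resolves a tunnel id
 * back to a VLAN through vg->tunnel_hash. The userspace analogue below uses a
 * plain array and linear scans in place of the rhashtable; all demo_* names
 * are invented for this example.
 */
#include <stdio.h>
#include <stdint.h>

struct demo_vlan {
	uint16_t vid;
	uint32_t tunnel_id;	/* 0 means "no tunnel mapped" */
};

static struct demo_vlan demo_vlans[] = {
	{ .vid = 10, .tunnel_id = 1000 },
	{ .vid = 20, .tunnel_id = 2000 },
	{ .vid = 30, .tunnel_id = 0 },
};

#define DEMO_NVLANS (sizeof(demo_vlans) / sizeof(demo_vlans[0]))

/* Egress direction: vid -> tunnel id, read off the VLAN entry itself. */
static uint32_t demo_vid_to_tunnel(uint16_t vid)
{
	for (size_t i = 0; i < DEMO_NVLANS; i++)
		if (demo_vlans[i].vid == vid)
			return demo_vlans[i].tunnel_id;
	return 0;
}

/* Ingress direction: tunnel id -> vid, the reverse index (tunnel_hash role). */
static int demo_tunnel_to_vid(uint32_t tunnel_id, uint16_t *vid)
{
	for (size_t i = 0; i < DEMO_NVLANS; i++) {
		if (demo_vlans[i].tunnel_id &&
		    demo_vlans[i].tunnel_id == tunnel_id) {
			*vid = demo_vlans[i].vid;
			return 0;
		}
	}
	return -1;
}

int main(void)
{
	uint16_t vid;

	printf("vid 10 -> tunnel %u\n", demo_vid_to_tunnel(10));
	if (!demo_tunnel_to_vid(2000, &vid))
		printf("tunnel 2000 -> vid %u\n", vid);
	return 0;
}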
2 3 37228 89 37214 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 | /* SPDX-License-Identifier: GPL-2.0 */ /* * descriptor table internals; you almost certainly want file.h instead. */ #ifndef __LINUX_FDTABLE_H #define __LINUX_FDTABLE_H #include <linux/posix_types.h> #include <linux/compiler.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/nospec.h> #include <linux/types.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/atomic.h> /* * The default fd array needs to be at least BITS_PER_LONG, * as this is the granularity returned by copy_fdset(). */ #define NR_OPEN_DEFAULT BITS_PER_LONG struct fdtable { unsigned int max_fds; struct file __rcu **fd; /* current fd array */ unsigned long *close_on_exec; unsigned long *open_fds; unsigned long *full_fds_bits; struct rcu_head rcu; }; static inline bool close_on_exec(unsigned int fd, const struct fdtable *fdt) { return test_bit(fd, fdt->close_on_exec); } static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) { return test_bit(fd, fdt->open_fds); } /* * Open file table structure */ struct files_struct { /* * read mostly part */ atomic_t count; bool resize_in_progress; wait_queue_head_t resize_wait; struct fdtable __rcu *fdt; struct fdtable fdtab; /* * written part on a separate cache line in SMP */ spinlock_t file_lock ____cacheline_aligned_in_smp; unsigned int next_fd; unsigned long close_on_exec_init[1]; unsigned long open_fds_init[1]; unsigned long full_fds_bits_init[1]; struct file __rcu * fd_array[NR_OPEN_DEFAULT]; }; struct file_operations; struct vfsmount; struct dentry; #define rcu_dereference_check_fdtable(files, fdtfd) \ rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock)) #define files_fdtable(files) \ rcu_dereference_check_fdtable((files), (files)->fdt) /* * The caller must ensure that fd table isn't shared or hold rcu or file lock */ static inline struct file *__fcheck_files(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = rcu_dereference_raw(files->fdt); if (fd < fdt->max_fds) { fd = array_index_nospec(fd, fdt->max_fds); return rcu_dereference_raw(fdt->fd[fd]); } return NULL; } static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd) { RCU_LOCKDEP_WARN(!rcu_read_lock_held() && !lockdep_is_held(&files->file_lock), "suspicious rcu_dereference_check() usage"); return __fcheck_files(files, fd); } /* * Check whether the specified fd has an open file. 
*/ #define fcheck(fd) fcheck_files(current->files, fd) struct task_struct; struct files_struct *get_files_struct(struct task_struct *); void put_files_struct(struct files_struct *fs); void reset_files_struct(struct files_struct *); int unshare_files(struct files_struct **); struct files_struct *dup_fd(struct files_struct *, int *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, int (*)(const void *, struct file *, unsigned), const void *); extern int __alloc_fd(struct files_struct *files, unsigned start, unsigned end, unsigned flags); extern void __fd_install(struct files_struct *files, unsigned int fd, struct file *file); extern int __close_fd(struct files_struct *files, unsigned int fd); extern struct kmem_cache *files_cachep; #endif /* __LINUX_FDTABLE_H */ |
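/*
 * Hedged usage sketch (not part of the header above): looking up a struct
 * file for the current task the way fcheck_files() expects, i.e. under
 * rcu_read_lock().  The wrapper name is hypothetical; note that the returned
 * pointer is only stable while the RCU read-side section is held unless an
 * extra reference is taken.
 */
static inline bool example_fd_is_open(unsigned int fd)
{
	struct file *file;

	rcu_read_lock();
	file = fcheck_files(current->files, fd);
	rcu_read_unlock();

	return file != NULL;
}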
7 116 116 117 117 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | /* * Key-agreement Protocol Primitives (KPP) * * Copyright (c) 2016, Intel Corporation * Authors: Salvatore Benedetto <salvatore.benedetto@intel.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/crypto.h> #include <crypto/algapi.h> #include <linux/cryptouser.h> #include <linux/compiler.h> #include <net/netlink.h> #include <crypto/kpp.h> #include <crypto/internal/kpp.h> #include "internal.h" #ifdef CONFIG_NET static int crypto_kpp_report(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_kpp rkpp; strncpy(rkpp.type, "kpp", sizeof(rkpp.type)); if (nla_put(skb, CRYPTOCFGA_REPORT_KPP, sizeof(struct crypto_report_kpp), &rkpp)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } #else static int crypto_kpp_report(struct sk_buff *skb, struct crypto_alg *alg) { return -ENOSYS; } #endif static void crypto_kpp_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_kpp_show(struct seq_file *m, struct crypto_alg *alg) { seq_puts(m, "type : kpp\n"); } static void crypto_kpp_exit_tfm(struct crypto_tfm *tfm) { struct crypto_kpp *kpp = __crypto_kpp_tfm(tfm); struct kpp_alg *alg = crypto_kpp_alg(kpp); alg->exit(kpp); } static int crypto_kpp_init_tfm(struct crypto_tfm *tfm) { struct crypto_kpp *kpp = __crypto_kpp_tfm(tfm); struct kpp_alg *alg = crypto_kpp_alg(kpp); if (alg->exit) kpp->base.exit = crypto_kpp_exit_tfm; if (alg->init) return alg->init(kpp); return 0; } static const struct crypto_type crypto_kpp_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_kpp_init_tfm, #ifdef CONFIG_PROC_FS .show = crypto_kpp_show, #endif .report = crypto_kpp_report, .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_MASK, .type = CRYPTO_ALG_TYPE_KPP, .tfmsize = offsetof(struct crypto_kpp, base), }; struct crypto_kpp *crypto_alloc_kpp(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_kpp_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_kpp); static void kpp_prepare_alg(struct kpp_alg *alg) { struct crypto_alg *base = &alg->base; base->cra_type = &crypto_kpp_type; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; base->cra_flags |= CRYPTO_ALG_TYPE_KPP; } int crypto_register_kpp(struct kpp_alg *alg) { struct crypto_alg *base = &alg->base; kpp_prepare_alg(alg); return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_kpp); void crypto_unregister_kpp(struct kpp_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_kpp); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Key-agreement Protocol Primitives"); |
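/*
 * Hedged usage sketch (not part of the file above): allocating and releasing
 * a kpp transform from a hypothetical caller.  "dh" is only an example
 * algorithm name, and crypto_free_kpp() is declared in <crypto/kpp.h> rather
 * than defined here; error handling is kept minimal on purpose.
 */
static int example_kpp_user(void)
{
	struct crypto_kpp *tfm;

	tfm = crypto_alloc_kpp("dh", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	/* crypto_kpp_set_secret() / kpp_request_alloc() would go here */

	crypto_free_kpp(tfm);
	return 0;
}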
251 119 252 92 114 113 246 247 247 4 4 247 67 68 68 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_inode_item.h" #include "xfs_trace.h" #include <linux/iversion.h> /* * Add a locked inode to the transaction. * * The inode must be locked, and it cannot be associated with any transaction. * If lock_flags is non-zero the inode will be unlocked on transaction commit. */ void xfs_trans_ijoin( struct xfs_trans *tp, struct xfs_inode *ip, uint lock_flags) { xfs_inode_log_item_t *iip; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (ip->i_itemp == NULL) xfs_inode_item_init(ip, ip->i_mount); iip = ip->i_itemp; ASSERT(iip->ili_lock_flags == 0); iip->ili_lock_flags = lock_flags; ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); /* * Get a log_item_desc to point at the new item. */ xfs_trans_add_item(tp, &iip->ili_item); } /* * Transactional inode timestamp update. Requires the inode to be locked and * joined to the transaction supplied. Relies on the transaction subsystem to * track dirty state and update/writeback the inode accordingly. */ void xfs_trans_ichgtime( struct xfs_trans *tp, struct xfs_inode *ip, int flags) { struct inode *inode = VFS_I(ip); struct timespec64 tv; ASSERT(tp); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); tv = current_time(inode); if (flags & XFS_ICHGTIME_MOD) inode->i_mtime = tv; if (flags & XFS_ICHGTIME_CHG) inode->i_ctime = tv; } /* * This is called to mark the fields indicated in fieldmask as needing * to be logged when the transaction is committed. The inode must * already be associated with the given transaction. * * The values for fieldmask are defined in xfs_inode_item.h. We always * log all of the core inode if any of it has changed, and we always log * all of the inline data/extents/b-tree root if any of them has changed. */ void xfs_trans_log_inode( xfs_trans_t *tp, xfs_inode_t *ip, uint flags) { struct inode *inode = VFS_I(ip); ASSERT(ip->i_itemp != NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); /* * Don't bother with i_lock for the I_DIRTY_TIME check here, as races * don't matter - we either will need an extra transaction in 24 hours * to log the timestamps, or will clear already cleared fields in the * worst case. */ if (inode->i_state & I_DIRTY_TIME) { spin_lock(&inode->i_lock); inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); } /* * Record the specific change for fdatasync optimisation. This * allows fdatasync to skip log forces for inodes that are only * timestamp dirty. We do this before the change count so that * the core being logged in this case does not impact on fdatasync * behaviour. 
*/ ip->i_itemp->ili_fsync_fields |= flags; /* * First time we log the inode in a transaction, bump the inode change * counter if it is configured for this to occur. While we have the * inode locked exclusively for metadata modification, we can usually * avoid setting XFS_ILOG_CORE if no one has queried the value since * the last time it was incremented. If we have XFS_ILOG_CORE already * set however, then go ahead and bump the i_version counter * unconditionally. */ if (!test_and_set_bit(XFS_LI_DIRTY, &ip->i_itemp->ili_item.li_flags) && IS_I_VERSION(VFS_I(ip))) { if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE)) flags |= XFS_ILOG_CORE; } tp->t_flags |= XFS_TRANS_DIRTY; /* * Always OR in the bits from the ili_last_fields field. * This is to coordinate with the xfs_iflush() and xfs_iflush_done() * routines in the eventual clearing of the ili_fields bits. * See the big comment in xfs_iflush() for an explanation of * this coordination mechanism. */ flags |= ip->i_itemp->ili_last_fields; ip->i_itemp->ili_fields |= flags; } int xfs_trans_roll_inode( struct xfs_trans **tpp, struct xfs_inode *ip) { int error; xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); error = xfs_trans_roll(tpp); if (!error) xfs_trans_ijoin(*tpp, ip, 0); return error; } |
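/*
 * Hedged usage sketch (not part of the file above): the usual sequence a
 * hypothetical XFS caller follows once it holds the inode ILOCK exclusively
 * and has an open transaction - join the inode, bump the timestamps, then
 * log the inode core.  The function name is illustrative.
 */
static void example_touch_inode(struct xfs_trans *tp, struct xfs_inode *ip)
{
	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));

	xfs_trans_ijoin(tp, ip, 0);
	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}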
640 641 641 553 552 551 640 21 12 21 553 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 | /* * AppArmor security module * * This file contains AppArmor security identifier (secid) manipulation fns * * Copyright 2009-2017 Canonical Ltd. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation, version 2 of the * License. * * * AppArmor allocates a unique secid for every label used. If a label * is replaced it receives the secid of the label it is replacing. */ #include <linux/errno.h> #include <linux/err.h> #include <linux/gfp.h> #include <linux/idr.h> #include <linux/slab.h> #include <linux/spinlock.h> #include "include/cred.h" #include "include/lib.h" #include "include/secid.h" #include "include/label.h" #include "include/policy_ns.h" /* * secids - do not pin labels with a refcount. They rely on the label * properly updating/freeing them */ #define AA_FIRST_SECID 1 static DEFINE_IDR(aa_secids); static DEFINE_SPINLOCK(secid_lock); /* * TODO: allow policy to reserve a secid range? * TODO: add secid pinning * TODO: use secid_update in label replace */ /** * aa_secid_update - update a secid mapping to a new label * @secid: secid to update * @label: label the secid will now map to */ void aa_secid_update(u32 secid, struct aa_label *label) { unsigned long flags; spin_lock_irqsave(&secid_lock, flags); idr_replace(&aa_secids, label, secid); spin_unlock_irqrestore(&secid_lock, flags); } /** * * see label for inverse aa_label_to_secid */ struct aa_label *aa_secid_to_label(u32 secid) { struct aa_label *label; rcu_read_lock(); label = idr_find(&aa_secids, secid); rcu_read_unlock(); return label; } int apparmor_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { /* TODO: cache secctx and ref count so we don't have to recreate */ struct aa_label *label = aa_secid_to_label(secid); int len; AA_BUG(!seclen); if (!label) return -EINVAL; if (secdata) len = aa_label_asxprint(secdata, root_ns, label, FLAG_SHOW_MODE | FLAG_VIEW_SUBNS | FLAG_HIDDEN_UNCONFINED | FLAG_ABS_ROOT, GFP_ATOMIC); else len = aa_label_snxprint(NULL, 0, root_ns, label, FLAG_SHOW_MODE | FLAG_VIEW_SUBNS | FLAG_HIDDEN_UNCONFINED | FLAG_ABS_ROOT); if (len < 0) return -ENOMEM; *seclen = len; return 0; } int apparmor_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { struct aa_label *label; label = aa_label_strn_parse(&root_ns->unconfined->label, secdata, seclen, GFP_KERNEL, false, false); if (IS_ERR(label)) return PTR_ERR(label); *secid = label->secid; return 0; } void apparmor_release_secctx(char *secdata, u32 seclen) { kfree(secdata); } /** * aa_alloc_secid - allocate a new secid for a profile * @label: the label to allocate a secid for * @gfp: memory allocation flags * * Returns: 0 with @label->secid initialized * <0 returns error with @label->secid set to AA_SECID_INVALID */ int aa_alloc_secid(struct aa_label *label, gfp_t gfp) { unsigned long flags; int ret; idr_preload(gfp); 
spin_lock_irqsave(&secid_lock, flags); ret = idr_alloc(&aa_secids, label, AA_FIRST_SECID, 0, GFP_ATOMIC); spin_unlock_irqrestore(&secid_lock, flags); idr_preload_end(); if (ret < 0) { label->secid = AA_SECID_INVALID; return ret; } AA_BUG(ret == AA_SECID_INVALID); label->secid = ret; return 0; } /** * aa_free_secid - free a secid * @secid: secid to free */ void aa_free_secid(u32 secid) { unsigned long flags; spin_lock_irqsave(&secid_lock, flags); idr_remove(&aa_secids, secid); spin_unlock_irqrestore(&secid_lock, flags); } void aa_secids_init(void) { idr_init_base(&aa_secids, AA_FIRST_SECID); } |
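/*
 * Hedged usage sketch (not part of the file above): how label setup and
 * teardown are expected to pair aa_alloc_secid() with aa_free_secid().  The
 * wrapper names are hypothetical, and the AA_SECID_INVALID check assumes the
 * caller leaves label->secid at that value until allocation succeeds.
 */
static int example_label_init_secid(struct aa_label *label)
{
	int err;

	err = aa_alloc_secid(label, GFP_KERNEL);
	if (err)
		return err;	/* label->secid == AA_SECID_INVALID here */

	return 0;
}

static void example_label_free_secid(struct aa_label *label)
{
	if (label->secid != AA_SECID_INVALID)
		aa_free_secid(label->secid);
}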
896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 | /* * Copyright (C) 2017 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License v2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <https://www.gnu.org/licenses/>. */ #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/debugfs.h> #include <linux/blk-mq.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) { if (stat->nr_samples) { seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu", stat->nr_samples, stat->mean, stat->min, stat->max); } else { seq_puts(m, "samples=0"); } } static int queue_poll_stat_show(void *data, struct seq_file *m) { struct request_queue *q = data; int bucket; for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS/2; bucket++) { seq_printf(m, "read (%d Bytes): ", 1 << (9+bucket)); print_stat(m, &q->poll_stat[2*bucket]); seq_puts(m, "\n"); seq_printf(m, "write (%d Bytes): ", 1 << (9+bucket)); print_stat(m, &q->poll_stat[2*bucket+1]); seq_puts(m, "\n"); } return 0; } static void *queue_requeue_list_start(struct seq_file *m, loff_t *pos) __acquires(&q->requeue_lock) { struct request_queue *q = m->private; spin_lock_irq(&q->requeue_lock); return seq_list_start(&q->requeue_list, *pos); } static void *queue_requeue_list_next(struct seq_file *m, void *v, loff_t *pos) { struct request_queue *q = m->private; return seq_list_next(v, &q->requeue_list, pos); } static void queue_requeue_list_stop(struct seq_file *m, void *v) __releases(&q->requeue_lock) { struct request_queue *q = m->private; spin_unlock_irq(&q->requeue_lock); } static const struct seq_operations queue_requeue_list_seq_ops = { .start = queue_requeue_list_start, .next = queue_requeue_list_next, .stop = queue_requeue_list_stop, .show = blk_mq_debugfs_rq_show, }; static int blk_flags_show(struct seq_file *m, const unsigned long flags, const char *const *flag_name, int flag_name_count) { bool sep = false; int i; for (i = 0; i < sizeof(flags) * BITS_PER_BYTE; i++) { if (!(flags & BIT(i))) continue; if (sep) seq_puts(m, "|"); sep = true; if (i < flag_name_count && flag_name[i]) seq_puts(m, flag_name[i]); else seq_printf(m, "%d", i); } return 0; } static int queue_pm_only_show(void *data, struct seq_file *m) { struct request_queue *q = data; seq_printf(m, "%d\n", atomic_read(&q->pm_only)); return 0; } #define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(QUEUED), QUEUE_FLAG_NAME(STOPPED), QUEUE_FLAG_NAME(DYING), QUEUE_FLAG_NAME(BYPASS), QUEUE_FLAG_NAME(BIDI), QUEUE_FLAG_NAME(NOMERGES), QUEUE_FLAG_NAME(SAME_COMP), QUEUE_FLAG_NAME(FAIL_IO), 
QUEUE_FLAG_NAME(NONROT), QUEUE_FLAG_NAME(IO_STAT), QUEUE_FLAG_NAME(DISCARD), QUEUE_FLAG_NAME(NOXMERGES), QUEUE_FLAG_NAME(ADD_RANDOM), QUEUE_FLAG_NAME(SECERASE), QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(DEAD), QUEUE_FLAG_NAME(INIT_DONE), QUEUE_FLAG_NAME(NO_SG_MERGE), QUEUE_FLAG_NAME(POLL), QUEUE_FLAG_NAME(WC), QUEUE_FLAG_NAME(FUA), QUEUE_FLAG_NAME(FLUSH_NQ), QUEUE_FLAG_NAME(DAX), QUEUE_FLAG_NAME(STATS), QUEUE_FLAG_NAME(POLL_STATS), QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), QUEUE_FLAG_NAME(QUIESCED), }; #undef QUEUE_FLAG_NAME static int queue_state_show(void *data, struct seq_file *m) { struct request_queue *q = data; blk_flags_show(m, q->queue_flags, blk_queue_flag_name, ARRAY_SIZE(blk_queue_flag_name)); seq_puts(m, "\n"); return 0; } static ssize_t queue_state_write(void *data, const char __user *buf, size_t count, loff_t *ppos) { struct request_queue *q = data; char opbuf[16] = { }, *op; /* * The "state" attribute is removed after blk_cleanup_queue() has called * blk_mq_free_queue(). Return if QUEUE_FLAG_DEAD has been set to avoid * triggering a use-after-free. */ if (blk_queue_dead(q)) return -ENOENT; if (count >= sizeof(opbuf)) { pr_err("%s: operation too long\n", __func__); goto inval; } if (copy_from_user(opbuf, buf, count)) return -EFAULT; op = strstrip(opbuf); if (strcmp(op, "run") == 0) { blk_mq_run_hw_queues(q, true); } else if (strcmp(op, "start") == 0) { blk_mq_start_stopped_hw_queues(q, true); } else if (strcmp(op, "kick") == 0) { blk_mq_kick_requeue_list(q); } else { pr_err("%s: unsupported operation '%s'\n", __func__, op); inval: pr_err("%s: use 'run', 'start' or 'kick'\n", __func__); return -EINVAL; } return count; } static int queue_write_hint_show(void *data, struct seq_file *m) { struct request_queue *q = data; int i; for (i = 0; i < BLK_MAX_WRITE_HINTS; i++) seq_printf(m, "hint%d: %llu\n", i, q->write_hints[i]); return 0; } static ssize_t queue_write_hint_store(void *data, const char __user *buf, size_t count, loff_t *ppos) { struct request_queue *q = data; int i; for (i = 0; i < BLK_MAX_WRITE_HINTS; i++) q->write_hints[i] = 0; return count; } static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "poll_stat", 0400, queue_poll_stat_show }, { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, { "pm_only", 0600, queue_pm_only_show, NULL }, { "state", 0600, queue_state_show, queue_state_write }, { "write_hints", 0600, queue_write_hint_show, queue_write_hint_store }, { "zone_wlock", 0400, queue_zone_wlock_show, NULL }, { }, }; #define HCTX_STATE_NAME(name) [BLK_MQ_S_##name] = #name static const char *const hctx_state_name[] = { HCTX_STATE_NAME(STOPPED), HCTX_STATE_NAME(TAG_ACTIVE), HCTX_STATE_NAME(SCHED_RESTART), }; #undef HCTX_STATE_NAME static int hctx_state_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; blk_flags_show(m, hctx->state, hctx_state_name, ARRAY_SIZE(hctx_state_name)); seq_puts(m, "\n"); return 0; } #define BLK_TAG_ALLOC_NAME(name) [BLK_TAG_ALLOC_##name] = #name static const char *const alloc_policy_name[] = { BLK_TAG_ALLOC_NAME(FIFO), BLK_TAG_ALLOC_NAME(RR), }; #undef BLK_TAG_ALLOC_NAME #define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name static const char *const hctx_flag_name[] = { HCTX_FLAG_NAME(SHOULD_MERGE), HCTX_FLAG_NAME(TAG_SHARED), HCTX_FLAG_NAME(SG_MERGE), HCTX_FLAG_NAME(BLOCKING), HCTX_FLAG_NAME(NO_SCHED), }; #undef HCTX_FLAG_NAME static int hctx_flags_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; const int alloc_policy = 
BLK_MQ_FLAG_TO_ALLOC_POLICY(hctx->flags); seq_puts(m, "alloc_policy="); if (alloc_policy < ARRAY_SIZE(alloc_policy_name) && alloc_policy_name[alloc_policy]) seq_puts(m, alloc_policy_name[alloc_policy]); else seq_printf(m, "%d", alloc_policy); seq_puts(m, " "); blk_flags_show(m, hctx->flags ^ BLK_ALLOC_POLICY_TO_MQ_FLAG(alloc_policy), hctx_flag_name, ARRAY_SIZE(hctx_flag_name)); seq_puts(m, "\n"); return 0; } #define REQ_OP_NAME(name) [REQ_OP_##name] = #name static const char *const op_name[] = { REQ_OP_NAME(READ), REQ_OP_NAME(WRITE), REQ_OP_NAME(FLUSH), REQ_OP_NAME(DISCARD), REQ_OP_NAME(ZONE_REPORT), REQ_OP_NAME(SECURE_ERASE), REQ_OP_NAME(ZONE_RESET), REQ_OP_NAME(WRITE_SAME), REQ_OP_NAME(WRITE_ZEROES), REQ_OP_NAME(SCSI_IN), REQ_OP_NAME(SCSI_OUT), REQ_OP_NAME(DRV_IN), REQ_OP_NAME(DRV_OUT), }; #undef REQ_OP_NAME #define CMD_FLAG_NAME(name) [__REQ_##name] = #name static const char *const cmd_flag_name[] = { CMD_FLAG_NAME(FAILFAST_DEV), CMD_FLAG_NAME(FAILFAST_TRANSPORT), CMD_FLAG_NAME(FAILFAST_DRIVER), CMD_FLAG_NAME(SYNC), CMD_FLAG_NAME(META), CMD_FLAG_NAME(PRIO), CMD_FLAG_NAME(NOMERGE), CMD_FLAG_NAME(IDLE), CMD_FLAG_NAME(INTEGRITY), CMD_FLAG_NAME(FUA), CMD_FLAG_NAME(PREFLUSH), CMD_FLAG_NAME(RAHEAD), CMD_FLAG_NAME(BACKGROUND), CMD_FLAG_NAME(NOUNMAP), CMD_FLAG_NAME(NOWAIT), }; #undef CMD_FLAG_NAME #define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name static const char *const rqf_name[] = { RQF_NAME(SORTED), RQF_NAME(STARTED), RQF_NAME(QUEUED), RQF_NAME(SOFTBARRIER), RQF_NAME(FLUSH_SEQ), RQF_NAME(MIXED_MERGE), RQF_NAME(MQ_INFLIGHT), RQF_NAME(DONTPREP), RQF_NAME(PREEMPT), RQF_NAME(COPY_USER), RQF_NAME(FAILED), RQF_NAME(QUIET), RQF_NAME(ELVPRIV), RQF_NAME(IO_STAT), RQF_NAME(ALLOCED), RQF_NAME(PM), RQF_NAME(HASHED), RQF_NAME(STATS), RQF_NAME(SPECIAL_PAYLOAD), RQF_NAME(ZONE_WRITE_LOCKED), RQF_NAME(MQ_POLL_SLEPT), }; #undef RQF_NAME static const char *const blk_mq_rq_state_name_array[] = { [MQ_RQ_IDLE] = "idle", [MQ_RQ_IN_FLIGHT] = "in_flight", [MQ_RQ_COMPLETE] = "complete", }; static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state) { if (WARN_ON_ONCE((unsigned int)rq_state >= ARRAY_SIZE(blk_mq_rq_state_name_array))) return "(?)"; return blk_mq_rq_state_name_array[rq_state]; } int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) { const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; const unsigned int op = rq->cmd_flags & REQ_OP_MASK; seq_printf(m, "%p {.op=", rq); if (op < ARRAY_SIZE(op_name) && op_name[op]) seq_printf(m, "%s", op_name[op]); else seq_printf(m, "%d", op); seq_puts(m, ", .cmd_flags="); blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name, ARRAY_SIZE(cmd_flag_name)); seq_puts(m, ", .rq_flags="); blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, ARRAY_SIZE(rqf_name)); seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq))); seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, rq->internal_tag); if (mq_ops->show_rq) mq_ops->show_rq(m, rq); seq_puts(m, "}\n"); return 0; } EXPORT_SYMBOL_GPL(__blk_mq_debugfs_rq_show); int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) { return __blk_mq_debugfs_rq_show(m, list_entry_rq(v)); } EXPORT_SYMBOL_GPL(blk_mq_debugfs_rq_show); static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos) __acquires(&hctx->lock) { struct blk_mq_hw_ctx *hctx = m->private; spin_lock(&hctx->lock); return seq_list_start(&hctx->dispatch, *pos); } static void *hctx_dispatch_next(struct seq_file *m, void *v, loff_t *pos) { struct blk_mq_hw_ctx *hctx = m->private; return seq_list_next(v, 
&hctx->dispatch, pos); } static void hctx_dispatch_stop(struct seq_file *m, void *v) __releases(&hctx->lock) { struct blk_mq_hw_ctx *hctx = m->private; spin_unlock(&hctx->lock); } static const struct seq_operations hctx_dispatch_seq_ops = { .start = hctx_dispatch_start, .next = hctx_dispatch_next, .stop = hctx_dispatch_stop, .show = blk_mq_debugfs_rq_show, }; struct show_busy_params { struct seq_file *m; struct blk_mq_hw_ctx *hctx; }; /* * Note: the state of a request may change while this function is in progress, * e.g. due to a concurrent blk_mq_finish_request() call. */ static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved) { const struct show_busy_params *params = data; if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx && blk_mq_rq_state(rq) != MQ_RQ_IDLE) __blk_mq_debugfs_rq_show(params->m, list_entry_rq(&rq->queuelist)); } static int hctx_busy_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct show_busy_params params = { .m = m, .hctx = hctx }; blk_mq_tagset_busy_iter(hctx->queue->tag_set, hctx_show_busy_rq, ¶ms); return 0; } static int hctx_ctx_map_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; sbitmap_bitmap_show(&hctx->ctx_map, m); return 0; } static void blk_mq_debugfs_tags_show(struct seq_file *m, struct blk_mq_tags *tags) { seq_printf(m, "nr_tags=%u\n", tags->nr_tags); seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags); seq_printf(m, "active_queues=%d\n", atomic_read(&tags->active_queues)); seq_puts(m, "\nbitmap_tags:\n"); sbitmap_queue_show(&tags->bitmap_tags, m); if (tags->nr_reserved_tags) { seq_puts(m, "\nbreserved_tags:\n"); sbitmap_queue_show(&tags->breserved_tags, m); } } static int hctx_tags_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; int res; res = mutex_lock_interruptible(&q->sysfs_lock); if (res) goto out; if (hctx->tags) blk_mq_debugfs_tags_show(m, hctx->tags); mutex_unlock(&q->sysfs_lock); out: return res; } static int hctx_tags_bitmap_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; int res; res = mutex_lock_interruptible(&q->sysfs_lock); if (res) goto out; if (hctx->tags) sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); mutex_unlock(&q->sysfs_lock); out: return res; } static int hctx_sched_tags_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; int res; res = mutex_lock_interruptible(&q->sysfs_lock); if (res) goto out; if (hctx->sched_tags) blk_mq_debugfs_tags_show(m, hctx->sched_tags); mutex_unlock(&q->sysfs_lock); out: return res; } static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; struct request_queue *q = hctx->queue; int res; res = mutex_lock_interruptible(&q->sysfs_lock); if (res) goto out; if (hctx->sched_tags) sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); mutex_unlock(&q->sysfs_lock); out: return res; } static int hctx_io_poll_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; seq_printf(m, "considered=%lu\n", hctx->poll_considered); seq_printf(m, "invoked=%lu\n", hctx->poll_invoked); seq_printf(m, "success=%lu\n", hctx->poll_success); return 0; } static ssize_t hctx_io_poll_write(void *data, const char __user *buf, size_t count, loff_t *ppos) { struct blk_mq_hw_ctx *hctx = data; hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0; return count; } 
static int hctx_dispatched_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; int i; seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]); for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) { unsigned int d = 1U << (i - 1); seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]); } seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]); return 0; } static ssize_t hctx_dispatched_write(void *data, const char __user *buf, size_t count, loff_t *ppos) { struct blk_mq_hw_ctx *hctx = data; int i; for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) hctx->dispatched[i] = 0; return count; } static int hctx_queued_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; seq_printf(m, "%lu\n", hctx->queued); return 0; } static ssize_t hctx_queued_write(void *data, const char __user *buf, size_t count, loff_t *ppos) { struct blk_mq_hw_ctx *hctx = data; hctx->queued = 0; return count; } static int hctx_run_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; seq_printf(m, "%lu\n", hctx->run); return 0; } static ssize_t hctx_run_write(void *data, const char __user *buf, size_t count, loff_t *ppos) { struct blk_mq_hw_ctx *hctx = data; hctx->run = 0; return count; } static int hctx_active_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; seq_printf(m, "%d\n", atomic_read(&hctx->nr_active)); return 0; } static int hctx_dispatch_busy_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; seq_printf(m, "%u\n", hctx->dispatch_busy); return 0; } static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) __acquires(&ctx->lock) { struct blk_mq_ctx *ctx = m->private; spin_lock(&ctx->lock); return seq_list_start(&ctx->rq_list, *pos); } static void *ctx_rq_list_next(struct seq_file *m, void *v, loff_t *pos) { struct blk_mq_ctx *ctx = m->private; return seq_list_next(v, &ctx->rq_list, pos); } static void ctx_rq_list_stop(struct seq_file *m, void *v) __releases(&ctx->lock) { struct blk_mq_ctx *ctx = m->private; spin_unlock(&ctx->lock); } static const struct seq_operations ctx_rq_list_seq_ops = { .start = ctx_rq_list_start, .next = ctx_rq_list_next, .stop = ctx_rq_list_stop, .show = blk_mq_debugfs_rq_show, }; static int ctx_dispatched_show(void *data, struct seq_file *m) { struct blk_mq_ctx *ctx = data; seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]); return 0; } static ssize_t ctx_dispatched_write(void *data, const char __user *buf, size_t count, loff_t *ppos) { struct blk_mq_ctx *ctx = data; ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0; return count; } static int ctx_merged_show(void *data, struct seq_file *m) { struct blk_mq_ctx *ctx = data; seq_printf(m, "%lu\n", ctx->rq_merged); return 0; } static ssize_t ctx_merged_write(void *data, const char __user *buf, size_t count, loff_t *ppos) { struct blk_mq_ctx *ctx = data; ctx->rq_merged = 0; return count; } static int ctx_completed_show(void *data, struct seq_file *m) { struct blk_mq_ctx *ctx = data; seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]); return 0; } static ssize_t ctx_completed_write(void *data, const char __user *buf, size_t count, loff_t *ppos) { struct blk_mq_ctx *ctx = data; ctx->rq_completed[0] = ctx->rq_completed[1] = 0; return count; } static int blk_mq_debugfs_show(struct seq_file *m, void *v) { const struct blk_mq_debugfs_attr *attr = m->private; void *data = d_inode(m->file->f_path.dentry->d_parent)->i_private; return attr->show(data, m); } static ssize_t blk_mq_debugfs_write(struct 
file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *m = file->private_data; const struct blk_mq_debugfs_attr *attr = m->private; void *data = d_inode(file->f_path.dentry->d_parent)->i_private; /* * Attributes that only implement .seq_ops are read-only and 'attr' is * the same with 'data' in this case. */ if (attr == data || !attr->write) return -EPERM; return attr->write(data, buf, count, ppos); } static int blk_mq_debugfs_open(struct inode *inode, struct file *file) { const struct blk_mq_debugfs_attr *attr = inode->i_private; void *data = d_inode(file->f_path.dentry->d_parent)->i_private; struct seq_file *m; int ret; if (attr->seq_ops) { ret = seq_open(file, attr->seq_ops); if (!ret) { m = file->private_data; m->private = data; } return ret; } if (WARN_ON_ONCE(!attr->show)) return -EPERM; return single_open(file, blk_mq_debugfs_show, inode->i_private); } static int blk_mq_debugfs_release(struct inode *inode, struct file *file) { const struct blk_mq_debugfs_attr *attr = inode->i_private; if (attr->show) return single_release(inode, file); else return seq_release(inode, file); } static const struct file_operations blk_mq_debugfs_fops = { .open = blk_mq_debugfs_open, .read = seq_read, .write = blk_mq_debugfs_write, .llseek = seq_lseek, .release = blk_mq_debugfs_release, }; static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"state", 0400, hctx_state_show}, {"flags", 0400, hctx_flags_show}, {"dispatch", 0400, .seq_ops = &hctx_dispatch_seq_ops}, {"busy", 0400, hctx_busy_show}, {"ctx_map", 0400, hctx_ctx_map_show}, {"tags", 0400, hctx_tags_show}, {"tags_bitmap", 0400, hctx_tags_bitmap_show}, {"sched_tags", 0400, hctx_sched_tags_show}, {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show}, {"io_poll", 0600, hctx_io_poll_show, hctx_io_poll_write}, {"dispatched", 0600, hctx_dispatched_show, hctx_dispatched_write}, {"queued", 0600, hctx_queued_show, hctx_queued_write}, {"run", 0600, hctx_run_show, hctx_run_write}, {"active", 0400, hctx_active_show}, {"dispatch_busy", 0400, hctx_dispatch_busy_show}, {}, }; static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { {"rq_list", 0400, .seq_ops = &ctx_rq_list_seq_ops}, {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write}, {"merged", 0600, ctx_merged_show, ctx_merged_write}, {"completed", 0600, ctx_completed_show, ctx_completed_write}, {}, }; static bool debugfs_create_files(struct dentry *parent, void *data, const struct blk_mq_debugfs_attr *attr) { d_inode(parent)->i_private = data; for (; attr->name; attr++) { if (!debugfs_create_file(attr->name, attr->mode, parent, (void *)attr, &blk_mq_debugfs_fops)) return false; } return true; } int blk_mq_debugfs_register(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; if (!blk_debugfs_root) return -ENOENT; q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), blk_debugfs_root); if (!q->debugfs_dir) return -ENOMEM; if (!debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs)) goto err; /* * blk_mq_init_sched() attempted to do this already, but q->debugfs_dir * didn't exist yet (because we don't know what to name the directory * until the queue is registered to a gendisk). */ if (q->elevator && !q->sched_debugfs_dir) blk_mq_debugfs_register_sched(q); /* Similarly, blk_mq_init_hctx() couldn't do this previously. 
*/ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->debugfs_dir && blk_mq_debugfs_register_hctx(q, hctx)) goto err; if (q->elevator && !hctx->sched_debugfs_dir && blk_mq_debugfs_register_sched_hctx(q, hctx)) goto err; } return 0; err: blk_mq_debugfs_unregister(q); return -ENOMEM; } void blk_mq_debugfs_unregister(struct request_queue *q) { debugfs_remove_recursive(q->debugfs_dir); q->sched_debugfs_dir = NULL; q->debugfs_dir = NULL; } static int blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { struct dentry *ctx_dir; char name[20]; snprintf(name, sizeof(name), "cpu%u", ctx->cpu); ctx_dir = debugfs_create_dir(name, hctx->debugfs_dir); if (!ctx_dir) return -ENOMEM; if (!debugfs_create_files(ctx_dir, ctx, blk_mq_debugfs_ctx_attrs)) return -ENOMEM; return 0; } int blk_mq_debugfs_register_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { struct blk_mq_ctx *ctx; char name[20]; int i; if (!q->debugfs_dir) return -ENOENT; snprintf(name, sizeof(name), "hctx%u", hctx->queue_num); hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir); if (!hctx->debugfs_dir) return -ENOMEM; if (!debugfs_create_files(hctx->debugfs_dir, hctx, blk_mq_debugfs_hctx_attrs)) goto err; hctx_for_each_ctx(hctx, ctx, i) { if (blk_mq_debugfs_register_ctx(hctx, ctx)) goto err; } return 0; err: blk_mq_debugfs_unregister_hctx(hctx); return -ENOMEM; } void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) { debugfs_remove_recursive(hctx->debugfs_dir); hctx->sched_debugfs_dir = NULL; hctx->debugfs_dir = NULL; } int blk_mq_debugfs_register_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_debugfs_register_hctx(q, hctx)) return -ENOMEM; } return 0; } void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_unregister_hctx(hctx); } int blk_mq_debugfs_register_sched(struct request_queue *q) { struct elevator_type *e = q->elevator->type; if (!q->debugfs_dir) return -ENOENT; if (!e->queue_debugfs_attrs) return 0; q->sched_debugfs_dir = debugfs_create_dir("sched", q->debugfs_dir); if (!q->sched_debugfs_dir) return -ENOMEM; if (!debugfs_create_files(q->sched_debugfs_dir, q, e->queue_debugfs_attrs)) goto err; return 0; err: blk_mq_debugfs_unregister_sched(q); return -ENOMEM; } void blk_mq_debugfs_unregister_sched(struct request_queue *q) { debugfs_remove_recursive(q->sched_debugfs_dir); q->sched_debugfs_dir = NULL; } int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { struct elevator_type *e = q->elevator->type; if (!hctx->debugfs_dir) return -ENOENT; if (!e->hctx_debugfs_attrs) return 0; hctx->sched_debugfs_dir = debugfs_create_dir("sched", hctx->debugfs_dir); if (!hctx->sched_debugfs_dir) return -ENOMEM; if (!debugfs_create_files(hctx->sched_debugfs_dir, hctx, e->hctx_debugfs_attrs)) return -ENOMEM; return 0; } void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) { debugfs_remove_recursive(hctx->sched_debugfs_dir); hctx->sched_debugfs_dir = NULL; } |
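/*
 * Hedged usage sketch (not part of the file above): the shape of the
 * queue_debugfs_attrs table an I/O scheduler would hang off its elevator_type
 * so that blk_mq_debugfs_register_sched() picks it up.  The attribute name
 * and show routine below are hypothetical, modeled on the tables defined in
 * this file.
 */
static int example_sched_depth_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;

	seq_printf(m, "%lu\n", q->nr_requests);
	return 0;
}

static const struct blk_mq_debugfs_attr example_sched_queue_debugfs_attrs[] = {
	{"depth", 0400, example_sched_depth_show},
	{},
};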
#ifndef _TFRC_H_
#define _TFRC_H_
/*
 *  Copyright (c) 2007   The University of Aberdeen, Scotland, UK
 *  Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
 *  Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz>
 *  Copyright (c) 2005   Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *  Copyright (c) 2003   Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 */
#include <linux/types.h>
#include <linux/math64.h>
#include "../../dccp.h"

/* internal includes that this library exports: */
#include "loss_interval.h"
#include "packet_history.h"

#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
extern bool tfrc_debug;
#define tfrc_pr_debug(format, a...)	DCCP_PR_DEBUG(tfrc_debug, format, ##a)
#else
#define tfrc_pr_debug(format, a...)
#endif

/* integer-arithmetic divisions of type (a * 1000000)/b */
static inline u64 scaled_div(u64 a, u64 b)
{
	BUG_ON(b == 0);
	return div64_u64(a * 1000000, b);
}

static inline u32 scaled_div32(u64 a, u64 b)
{
	u64 result = scaled_div(a, b);

	if (result > UINT_MAX) {
		DCCP_CRIT("Overflow: %llu/%llu > UINT_MAX",
			  (unsigned long long)a, (unsigned long long)b);
		return UINT_MAX;
	}
	return result;
}

/**
 * tfrc_ewma  -  Exponentially weighted moving average
 * @weight: Weight to be used as damping factor, in units of 1/10
 */
static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
{
	return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
}

u32 tfrc_calc_x(u16 s, u32 R, u32 p);
u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
u32 tfrc_invert_loss_event_rate(u32 loss_event_rate);

int tfrc_tx_packet_history_init(void);
void tfrc_tx_packet_history_exit(void);
int tfrc_rx_packet_history_init(void);
void tfrc_rx_packet_history_exit(void);

int tfrc_li_init(void);
void tfrc_li_exit(void);

#ifdef CONFIG_IP_DCCP_TFRC_LIB
int tfrc_lib_init(void);
void tfrc_lib_exit(void);
#else
#define tfrc_lib_init() (0)
#define tfrc_lib_exit()
#endif
#endif /* _TFRC_H_ */
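/*
 * Hedged illustration (not part of the header above) of what tfrc_ewma()
 * computes for the common weight of 9/10: with avg = 1000 and newval = 2000
 * the result is (9 * 1000 + 1 * 2000) / 10 = 1100, i.e. the old average
 * moves a tenth of the way towards the new sample.  The helper name is
 * hypothetical.
 */
static inline u32 example_rtt_update(u32 rtt_avg, u32 sample)
{
	/* a first sample initialises the average, later ones are damped */
	return tfrc_ewma(rtt_avg, sample, 9);
}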
/*
 * vimc-common.c Virtual Media Controller Driver
 *
 * Copyright (C) 2015-2017 Helen Koike <helen.fornazier@gmail.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
* */ #include <linux/init.h> #include <linux/module.h> #include "vimc-common.h" /* * NOTE: non-bayer formats need to come first (necessary for enum_mbus_code * in the scaler) */ static const struct vimc_pix_map vimc_pix_map_list[] = { /* TODO: add all missing formats */ /* RGB formats */ { .code = MEDIA_BUS_FMT_BGR888_1X24, .pixelformat = V4L2_PIX_FMT_BGR24, .bpp = 3, .bayer = false, }, { .code = MEDIA_BUS_FMT_RGB888_1X24, .pixelformat = V4L2_PIX_FMT_RGB24, .bpp = 3, .bayer = false, }, { .code = MEDIA_BUS_FMT_ARGB8888_1X32, .pixelformat = V4L2_PIX_FMT_ARGB32, .bpp = 4, .bayer = false, }, /* Bayer formats */ { .code = MEDIA_BUS_FMT_SBGGR8_1X8, .pixelformat = V4L2_PIX_FMT_SBGGR8, .bpp = 1, .bayer = true, }, { .code = MEDIA_BUS_FMT_SGBRG8_1X8, .pixelformat = V4L2_PIX_FMT_SGBRG8, .bpp = 1, .bayer = true, }, { .code = MEDIA_BUS_FMT_SGRBG8_1X8, .pixelformat = V4L2_PIX_FMT_SGRBG8, .bpp = 1, .bayer = true, }, { .code = MEDIA_BUS_FMT_SRGGB8_1X8, .pixelformat = V4L2_PIX_FMT_SRGGB8, .bpp = 1, .bayer = true, }, { .code = MEDIA_BUS_FMT_SBGGR10_1X10, .pixelformat = V4L2_PIX_FMT_SBGGR10, .bpp = 2, .bayer = true, }, { .code = MEDIA_BUS_FMT_SGBRG10_1X10, .pixelformat = V4L2_PIX_FMT_SGBRG10, .bpp = 2, .bayer = true, }, { .code = MEDIA_BUS_FMT_SGRBG10_1X10, .pixelformat = V4L2_PIX_FMT_SGRBG10, .bpp = 2, .bayer = true, }, { .code = MEDIA_BUS_FMT_SRGGB10_1X10, .pixelformat = V4L2_PIX_FMT_SRGGB10, .bpp = 2, .bayer = true, }, /* 10bit raw bayer a-law compressed to 8 bits */ { .code = MEDIA_BUS_FMT_SBGGR10_ALAW8_1X8, .pixelformat = V4L2_PIX_FMT_SBGGR10ALAW8, .bpp = 1, .bayer = true, }, { .code = MEDIA_BUS_FMT_SGBRG10_ALAW8_1X8, .pixelformat = V4L2_PIX_FMT_SGBRG10ALAW8, .bpp = 1, .bayer = true, }, { .code = MEDIA_BUS_FMT_SGRBG10_ALAW8_1X8, .pixelformat = V4L2_PIX_FMT_SGRBG10ALAW8, .bpp = 1, .bayer = true, }, { .code = MEDIA_BUS_FMT_SRGGB10_ALAW8_1X8, .pixelformat = V4L2_PIX_FMT_SRGGB10ALAW8, .bpp = 1, .bayer = true, }, /* 10bit raw bayer DPCM compressed to 8 bits */ { .code = MEDIA_BUS_FMT_SBGGR10_DPCM8_1X8, .pixelformat = V4L2_PIX_FMT_SBGGR10DPCM8, .bpp = 1, .bayer = true, }, { .code = MEDIA_BUS_FMT_SGBRG10_DPCM8_1X8, .pixelformat = V4L2_PIX_FMT_SGBRG10DPCM8, .bpp = 1, .bayer = true, }, { .code = MEDIA_BUS_FMT_SGRBG10_DPCM8_1X8, .pixelformat = V4L2_PIX_FMT_SGRBG10DPCM8, .bpp = 1, .bayer = true, }, { .code = MEDIA_BUS_FMT_SRGGB10_DPCM8_1X8, .pixelformat = V4L2_PIX_FMT_SRGGB10DPCM8, .bpp = 1, .bayer = true, }, { .code = MEDIA_BUS_FMT_SBGGR12_1X12, .pixelformat = V4L2_PIX_FMT_SBGGR12, .bpp = 2, .bayer = true, }, { .code = MEDIA_BUS_FMT_SGBRG12_1X12, .pixelformat = V4L2_PIX_FMT_SGBRG12, .bpp = 2, .bayer = true, }, { .code = MEDIA_BUS_FMT_SGRBG12_1X12, .pixelformat = V4L2_PIX_FMT_SGRBG12, .bpp = 2, .bayer = true, }, { .code = MEDIA_BUS_FMT_SRGGB12_1X12, .pixelformat = V4L2_PIX_FMT_SRGGB12, .bpp = 2, .bayer = true, }, }; const struct vimc_pix_map *vimc_pix_map_by_index(unsigned int i) { if (i >= ARRAY_SIZE(vimc_pix_map_list)) return NULL; return &vimc_pix_map_list[i]; } EXPORT_SYMBOL_GPL(vimc_pix_map_by_index); const struct vimc_pix_map *vimc_pix_map_by_code(u32 code) { unsigned int i; for (i = 0; i < ARRAY_SIZE(vimc_pix_map_list); i++) { if (vimc_pix_map_list[i].code == code) return &vimc_pix_map_list[i]; } return NULL; } EXPORT_SYMBOL_GPL(vimc_pix_map_by_code); const struct vimc_pix_map *vimc_pix_map_by_pixelformat(u32 pixelformat) { unsigned int i; for (i = 0; i < ARRAY_SIZE(vimc_pix_map_list); i++) { if (vimc_pix_map_list[i].pixelformat == pixelformat) return &vimc_pix_map_list[i]; } return NULL; } 
EXPORT_SYMBOL_GPL(vimc_pix_map_by_pixelformat); /* Helper function to allocate and initialize pads */ struct media_pad *vimc_pads_init(u16 num_pads, const unsigned long *pads_flag) { struct media_pad *pads; unsigned int i; /* Allocate memory for the pads */ pads = kcalloc(num_pads, sizeof(*pads), GFP_KERNEL); if (!pads) return ERR_PTR(-ENOMEM); /* Initialize the pads */ for (i = 0; i < num_pads; i++) { pads[i].index = i; pads[i].flags = pads_flag[i]; } return pads; } EXPORT_SYMBOL_GPL(vimc_pads_init); int vimc_pipeline_s_stream(struct media_entity *ent, int enable) { struct v4l2_subdev *sd; struct media_pad *pad; unsigned int i; int ret; for (i = 0; i < ent->num_pads; i++) { if (ent->pads[i].flags & MEDIA_PAD_FL_SOURCE) continue; /* Start the stream in the subdevice direct connected */ pad = media_entity_remote_pad(&ent->pads[i]); if (!pad) continue; if (!is_media_entity_v4l2_subdev(pad->entity)) return -EINVAL; sd = media_entity_to_v4l2_subdev(pad->entity); ret = v4l2_subdev_call(sd, video, s_stream, enable); if (ret && ret != -ENOIOCTLCMD) return ret; } return 0; } EXPORT_SYMBOL_GPL(vimc_pipeline_s_stream); static int vimc_get_mbus_format(struct media_pad *pad, struct v4l2_subdev_format *fmt) { if (is_media_entity_v4l2_subdev(pad->entity)) { struct v4l2_subdev *sd = media_entity_to_v4l2_subdev(pad->entity); int ret; fmt->which = V4L2_SUBDEV_FORMAT_ACTIVE; fmt->pad = pad->index; ret = v4l2_subdev_call(sd, pad, get_fmt, NULL, fmt); if (ret) return ret; } else if (is_media_entity_v4l2_video_device(pad->entity)) { struct video_device *vdev = container_of(pad->entity, struct video_device, entity); struct vimc_ent_device *ved = video_get_drvdata(vdev); const struct vimc_pix_map *vpix; struct v4l2_pix_format vdev_fmt; if (!ved->vdev_get_format) return -ENOIOCTLCMD; ved->vdev_get_format(ved, &vdev_fmt); vpix = vimc_pix_map_by_pixelformat(vdev_fmt.pixelformat); v4l2_fill_mbus_format(&fmt->format, &vdev_fmt, vpix->code); } else { return -EINVAL; } return 0; } int vimc_link_validate(struct media_link *link) { struct v4l2_subdev_format source_fmt, sink_fmt; int ret; ret = vimc_get_mbus_format(link->source, &source_fmt); if (ret) return ret; ret = vimc_get_mbus_format(link->sink, &sink_fmt); if (ret) return ret; pr_info("vimc link validate: " "%s:src:%dx%d (0x%x, %d, %d, %d, %d) " "%s:snk:%dx%d (0x%x, %d, %d, %d, %d)\n", /* src */ link->source->entity->name, source_fmt.format.width, source_fmt.format.height, source_fmt.format.code, source_fmt.format.colorspace, source_fmt.format.quantization, source_fmt.format.xfer_func, source_fmt.format.ycbcr_enc, /* sink */ link->sink->entity->name, sink_fmt.format.width, sink_fmt.format.height, sink_fmt.format.code, sink_fmt.format.colorspace, sink_fmt.format.quantization, sink_fmt.format.xfer_func, sink_fmt.format.ycbcr_enc); /* The width, height and code must match. */ if (source_fmt.format.width != sink_fmt.format.width || source_fmt.format.height != sink_fmt.format.height || source_fmt.format.code != sink_fmt.format.code) return -EPIPE; /* * The field order must match, or the sink field order must be NONE * to support interlaced hardware connected to bridges that support * progressive formats only. 
*/ if (source_fmt.format.field != sink_fmt.format.field && sink_fmt.format.field != V4L2_FIELD_NONE) return -EPIPE; /* * If colorspace is DEFAULT, then assume all the colorimetry is also * DEFAULT, return 0 to skip comparing the other colorimetry parameters */ if (source_fmt.format.colorspace == V4L2_COLORSPACE_DEFAULT || sink_fmt.format.colorspace == V4L2_COLORSPACE_DEFAULT) return 0; /* Colorspace must match. */ if (source_fmt.format.colorspace != sink_fmt.format.colorspace) return -EPIPE; /* Colorimetry must match if they are not set to DEFAULT */ if (source_fmt.format.ycbcr_enc != V4L2_YCBCR_ENC_DEFAULT && sink_fmt.format.ycbcr_enc != V4L2_YCBCR_ENC_DEFAULT && source_fmt.format.ycbcr_enc != sink_fmt.format.ycbcr_enc) return -EPIPE; if (source_fmt.format.quantization != V4L2_QUANTIZATION_DEFAULT && sink_fmt.format.quantization != V4L2_QUANTIZATION_DEFAULT && source_fmt.format.quantization != sink_fmt.format.quantization) return -EPIPE; if (source_fmt.format.xfer_func != V4L2_XFER_FUNC_DEFAULT && sink_fmt.format.xfer_func != V4L2_XFER_FUNC_DEFAULT && source_fmt.format.xfer_func != sink_fmt.format.xfer_func) return -EPIPE; return 0; } EXPORT_SYMBOL_GPL(vimc_link_validate); static const struct media_entity_operations vimc_ent_sd_mops = { .link_validate = vimc_link_validate, }; int vimc_ent_sd_register(struct vimc_ent_device *ved, struct v4l2_subdev *sd, struct v4l2_device *v4l2_dev, const char *const name, u32 function, u16 num_pads, const unsigned long *pads_flag, const struct v4l2_subdev_ops *sd_ops) { int ret; /* Allocate the pads */ ved->pads = vimc_pads_init(num_pads, pads_flag); if (IS_ERR(ved->pads)) return PTR_ERR(ved->pads); /* Fill the vimc_ent_device struct */ ved->ent = &sd->entity; /* Initialize the subdev */ v4l2_subdev_init(sd, sd_ops); sd->entity.function = function; sd->entity.ops = &vimc_ent_sd_mops; sd->owner = THIS_MODULE; strlcpy(sd->name, name, sizeof(sd->name)); v4l2_set_subdevdata(sd, ved); /* Expose this subdev to user space */ sd->flags |= V4L2_SUBDEV_FL_HAS_DEVNODE; if (sd->ctrl_handler) sd->flags |= V4L2_SUBDEV_FL_HAS_EVENTS; /* Initialize the media entity */ ret = media_entity_pads_init(&sd->entity, num_pads, ved->pads); if (ret) goto err_clean_pads; /* Register the subdev with the v4l2 and the media framework */ ret = v4l2_device_register_subdev(v4l2_dev, sd); if (ret) { dev_err(v4l2_dev->dev, "%s: subdev register failed (err=%d)\n", name, ret); goto err_clean_m_ent; } return 0; err_clean_m_ent: media_entity_cleanup(&sd->entity); err_clean_pads: vimc_pads_cleanup(ved->pads); return ret; } EXPORT_SYMBOL_GPL(vimc_ent_sd_register); void vimc_ent_sd_unregister(struct vimc_ent_device *ved, struct v4l2_subdev *sd) { v4l2_device_unregister_subdev(sd); media_entity_cleanup(ved->ent); vimc_pads_cleanup(ved->pads); } EXPORT_SYMBOL_GPL(vimc_ent_sd_unregister); MODULE_DESCRIPTION("Virtual Media Controller Driver (VIMC) Common"); MODULE_AUTHOR("Helen Koike <helen.fornazier@gmail.com>"); MODULE_LICENSE("GPL"); |
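/*
 * Hedged usage sketch (not part of the file above): how a vimc entity driver
 * is expected to register its subdevice through vimc_ent_sd_register().  The
 * entity name, pad layout and function value are illustrative assumptions;
 * the real callers are the vimc sensor, debayer and scaler entities.
 */
static const unsigned long example_pads_flag[] = {
	MEDIA_PAD_FL_SINK,
	MEDIA_PAD_FL_SOURCE,
};

static int example_register_entity(struct vimc_ent_device *ved,
				   struct v4l2_subdev *sd,
				   struct v4l2_device *v4l2_dev,
				   const struct v4l2_subdev_ops *ops)
{
	return vimc_ent_sd_register(ved, sd, v4l2_dev, "example-ent",
				    MEDIA_ENT_F_PROC_VIDEO_SCALER,
				    ARRAY_SIZE(example_pads_flag),
				    example_pads_flag, ops);
}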
/* * Linux Socket Filter - Kernel level socket filtering * * Based on the design of the Berkeley Packet Filter. The new * internal format has been designed by PLUMgrid: * * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com * * Authors: * * Jay Schulist <jschlst@samba.org> * Alexei Starovoitov <ast@plumgrid.com> * Daniel Borkmann <dborkman@redhat.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * Andi Kleen - Fix a few bad bugs and races.
* Kris Katterjohn - Added many additional checks in bpf_check_classic() */ #include <linux/module.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/sock_diag.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_packet.h> #include <linux/if_arp.h> #include <linux/gfp.h> #include <net/inet_common.h> #include <net/ip.h> #include <net/protocol.h> #include <net/netlink.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/flow_dissector.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/uaccess.h> #include <asm/unaligned.h> #include <asm/cmpxchg.h> #include <linux/filter.h> #include <linux/ratelimit.h> #include <linux/seccomp.h> #include <linux/if_vlan.h> #include <linux/bpf.h> #include <net/sch_generic.h> #include <net/cls_cgroup.h> #include <net/dst_metadata.h> #include <net/dst.h> #include <net/sock_reuseport.h> #include <net/busy_poll.h> #include <net/tcp.h> #include <net/xfrm.h> #include <linux/bpf_trace.h> #include <net/xdp_sock.h> #include <linux/inetdevice.h> #include <net/ip_fib.h> #include <net/flow.h> #include <net/arp.h> #include <net/ipv6.h> #include <linux/seg6_local.h> #include <net/seg6.h> #include <net/seg6_local.h> /** * sk_filter_trim_cap - run a packet through a socket filter * @sk: sock associated with &sk_buff * @skb: buffer to filter * @cap: limit on how short the eBPF program may trim the packet * * Run the eBPF program and then cut skb->data to correct size returned by * the program. If pkt_len is 0 we toss packet. If skb->len is smaller * than pkt_len we keep whole skb->data. This is the socket level * wrapper to BPF_PROG_RUN. It returns 0 if the packet should * be accepted or -EPERM if the packet should be tossed. * */ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) { int err; struct sk_filter *filter; /* * If the skb was allocated from pfmemalloc reserves, only * allow SOCK_MEMALLOC sockets to use it as this socket is * helping free memory */ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); return -ENOMEM; } err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); if (err) return err; err = security_sock_rcv_skb(sk, skb); if (err) return err; rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter) { struct sock *save_sk = skb->sk; unsigned int pkt_len; skb->sk = sk; pkt_len = bpf_prog_run_save_cb(filter->prog, skb); skb->sk = save_sk; err = pkt_len ? 
pskb_trim(skb, max(cap, pkt_len)) : -EPERM; } rcu_read_unlock(); return err; } EXPORT_SYMBOL(sk_filter_trim_cap); BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb) { return skb_get_poff(skb); } BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; if (skb_is_nonlinear(skb)) return 0; if (skb->len < sizeof(struct nlattr)) return 0; if (a > skb->len - sizeof(struct nlattr)) return 0; nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x); if (nla) return (void *) nla - (void *) skb->data; return 0; } BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; if (skb_is_nonlinear(skb)) return 0; if (skb->len < sizeof(struct nlattr)) return 0; if (a > skb->len - sizeof(struct nlattr)) return 0; nla = (struct nlattr *) &skb->data[a]; if (nla->nla_len > skb->len - a) return 0; nla = nla_find_nested(nla, x); if (nla) return (void *) nla - (void *) skb->data; return 0; } BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { u8 tmp, *ptr; const int len = sizeof(tmp); if (offset >= 0) { if (headlen - offset >= len) return *(u8 *)(data + offset); if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) return tmp; } else { ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); if (likely(ptr)) return *(u8 *)ptr; } return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, int, offset) { return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len, offset); } BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { u16 tmp, *ptr; const int len = sizeof(tmp); if (offset >= 0) { if (headlen - offset >= len) return get_unaligned_be16(data + offset); if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) return be16_to_cpu(tmp); } else { ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); if (likely(ptr)) return get_unaligned_be16(ptr); } return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, int, offset) { return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len, offset); } BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { u32 tmp, *ptr; const int len = sizeof(tmp); if (likely(offset >= 0)) { if (headlen - offset >= len) return get_unaligned_be32(data + offset); if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) return be32_to_cpu(tmp); } else { ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); if (likely(ptr)) return get_unaligned_be32(ptr); } return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, int, offset) { return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len, offset); } BPF_CALL_0(bpf_get_raw_cpu_id) { return raw_smp_processor_id(); } static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { .func = bpf_get_raw_cpu_id, .gpl_only = false, .ret_type = RET_INTEGER, }; static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; switch (skb_field) { case SKF_AD_MARK: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, offsetof(struct sk_buff, mark)); break; case SKF_AD_PKTTYPE: *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET()); *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); #ifdef 
__BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); #endif break; case SKF_AD_QUEUE: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, queue_mapping)); break; case SKF_AD_VLAN_TAG: case SKF_AD_VLAN_TAG_PRESENT: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, offsetof(struct sk_buff, vlan_tci)); if (skb_field == SKF_AD_VLAN_TAG) { *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, ~VLAN_TAG_PRESENT); } else { /* dst_reg >>= 12 */ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12); /* dst_reg &= 1 */ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); } break; } return insn - insn_buf; } static bool convert_bpf_extensions(struct sock_filter *fp, struct bpf_insn **insnp) { struct bpf_insn *insn = *insnp; u32 cnt; switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); /* A = *(u16 *) (CTX + offsetof(protocol)) */ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, protocol)); /* A = ntohs(A) [emitting a nop or swap16] */ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); break; case SKF_AD_OFF + SKF_AD_PKTTYPE: cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_IFINDEX: case SKF_AD_OFF + SKF_AD_HATYPE: BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, dev)); /* if (tmp != 0) goto pc + 1 */ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); *insn++ = BPF_EXIT_INSN(); if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP, offsetof(struct net_device, ifindex)); else *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP, offsetof(struct net_device, type)); break; case SKF_AD_OFF + SKF_AD_MARK: cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_RXHASH: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, hash)); break; case SKF_AD_OFF + SKF_AD_QUEUE: cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TAG: cnt = convert_skb_access(SKF_AD_VLAN_TAG, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, BPF_REG_A, BPF_REG_CTX, insn); insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TPID: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */ *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, offsetof(struct sk_buff, vlan_proto)); /* A = ntohs(A) [emitting a nop or swap16] */ *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); break; case SKF_AD_OFF + SKF_AD_PAY_OFFSET: case SKF_AD_OFF + SKF_AD_NLATTR: case SKF_AD_OFF + SKF_AD_NLATTR_NEST: case SKF_AD_OFF + SKF_AD_CPU: case SKF_AD_OFF + SKF_AD_RANDOM: /* arg1 = CTX */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); /* arg2 = A */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); /* arg3 = X */ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); /* Emit call(arg1=CTX, arg2=A, arg3=X) */ switch 
(fp->k) { case SKF_AD_OFF + SKF_AD_PAY_OFFSET: *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset); break; case SKF_AD_OFF + SKF_AD_NLATTR: *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr); break; case SKF_AD_OFF + SKF_AD_NLATTR_NEST: *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest); break; case SKF_AD_OFF + SKF_AD_CPU: *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id); break; case SKF_AD_OFF + SKF_AD_RANDOM: *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); bpf_user_rnd_init_once(); break; } break; case SKF_AD_OFF + SKF_AD_ALU_XOR_X: /* A ^= X */ *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X); break; default: /* This is just a dummy call to avoid letting the compiler * evict __bpf_call_base() as an optimization. Placed here * where no-one bothers. */ BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0); return false; } *insnp = insn; return true; } static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) { const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS); int size = bpf_size_to_bytes(BPF_SIZE(fp->code)); bool endian = BPF_SIZE(fp->code) == BPF_H || BPF_SIZE(fp->code) == BPF_W; bool indirect = BPF_MODE(fp->code) == BPF_IND; const int ip_align = NET_IP_ALIGN; struct bpf_insn *insn = *insnp; int offset = fp->k; if (!indirect && ((unaligned_ok && offset >= 0) || (!unaligned_ok && offset >= 0 && offset + ip_align >= 0 && offset + ip_align % size == 0))) { bool ldx_off_ok = offset <= S16_MAX; *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian + (!ldx_off_ok * 2)); if (ldx_off_ok) { *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D, offset); } else { *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D); *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset); *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_TMP, 0); } if (endian) *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); *insn++ = BPF_JMP_A(8); } *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D); *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H); if (!indirect) { *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset); } else { *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X); if (fp->k) *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset); } switch (BPF_SIZE(fp->code)) { case BPF_B: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8); break; case BPF_H: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16); break; case BPF_W: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32); break; default: return false; } *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2); *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); *insn = BPF_EXIT_INSN(); *insnp = insn; return true; } /** * bpf_convert_filter - convert filter program * @prog: the user passed filter program * @len: the length of the user passed filter program * @new_prog: allocated 'struct bpf_prog' or NULL * @new_len: pointer to store length of converted program * @seen_ld_abs: bool whether we've seen ld_abs/ind * * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' * style extended BPF (eBPF). 
* Conversion workflow: * * 1) First pass for calculating the new program length: * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs) * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs) */ static int bpf_convert_filter(struct sock_filter *prog, int len, struct bpf_prog *new_prog, int *new_len, bool *seen_ld_abs) { int new_flen = 0, pass = 0, target, i, stack_off; struct bpf_insn *new_insn, *first_insn = NULL; struct sock_filter *fp; int *addrs = NULL; u8 bpf_src; BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); if (len <= 0 || len > BPF_MAXINSNS) return -EINVAL; if (new_prog) { first_insn = new_prog->insnsi; addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL | __GFP_NOWARN); if (!addrs) return -ENOMEM; } do_pass: new_insn = first_insn; fp = prog; /* Classic BPF related prologue emission. */ if (new_prog) { /* Classic BPF expects A and X to be reset first. These need * to be guaranteed to be the first two instructions. */ *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); /* All programs must keep CTX in callee saved BPF_REG_CTX. * In eBPF case it's done by the compiler, here we need to * do this ourself. Initial CTX is present in BPF_REG_ARG1. */ *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); if (*seen_ld_abs) { /* For packet access in classic BPF, cache skb->data * in callee-saved BPF R8 and skb->len - skb->data_len * (headlen) in BPF R9. Since classic BPF is read-only * on CTX, we only need to cache it once. */ *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), BPF_REG_D, BPF_REG_CTX, offsetof(struct sk_buff, data)); *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX, offsetof(struct sk_buff, len)); *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX, offsetof(struct sk_buff, data_len)); *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP); } } else { new_insn += 3; } for (i = 0; i < len; fp++, i++) { struct bpf_insn tmp_insns[32] = { }; struct bpf_insn *insn = tmp_insns; if (addrs) addrs[i] = new_insn - first_insn; switch (fp->code) { /* All arithmetic insns and skb loads map as-is. */ case BPF_ALU | BPF_ADD | BPF_X: case BPF_ALU | BPF_ADD | BPF_K: case BPF_ALU | BPF_SUB | BPF_X: case BPF_ALU | BPF_SUB | BPF_K: case BPF_ALU | BPF_AND | BPF_X: case BPF_ALU | BPF_AND | BPF_K: case BPF_ALU | BPF_OR | BPF_X: case BPF_ALU | BPF_OR | BPF_K: case BPF_ALU | BPF_LSH | BPF_X: case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_X: case BPF_ALU | BPF_RSH | BPF_K: case BPF_ALU | BPF_XOR | BPF_X: case BPF_ALU | BPF_XOR | BPF_K: case BPF_ALU | BPF_MUL | BPF_X: case BPF_ALU | BPF_MUL | BPF_K: case BPF_ALU | BPF_DIV | BPF_X: case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU | BPF_MOD | BPF_K: case BPF_ALU | BPF_NEG: case BPF_LD | BPF_ABS | BPF_W: case BPF_LD | BPF_ABS | BPF_H: case BPF_LD | BPF_ABS | BPF_B: case BPF_LD | BPF_IND | BPF_W: case BPF_LD | BPF_IND | BPF_H: case BPF_LD | BPF_IND | BPF_B: /* Check for overloaded BPF extension and * directly convert it if found, otherwise * just move on with mapping. 
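* (The overloaded extensions are the SKF_AD_OFF + SKF_AD_* ancillary loads recognized by convert_bpf_extensions() above; anything else falls through to the generic mapping below.)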
*/ if (BPF_CLASS(fp->code) == BPF_LD && BPF_MODE(fp->code) == BPF_ABS && convert_bpf_extensions(fp, &insn)) break; if (BPF_CLASS(fp->code) == BPF_LD && convert_bpf_ld_abs(fp, &insn)) { *seen_ld_abs = true; break; } if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); /* Error with exception code on div/mod by 0. * For cBPF programs, this was always return 0. */ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); *insn++ = BPF_EXIT_INSN(); } *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; /* Jump transformation cannot use BPF block macros * everywhere as offset calculation and target updates * require a bit more work than the rest, i.e. jump * opcodes map as-is, but offsets need adjustment. */ #define BPF_EMIT_JMP \ do { \ const s32 off_min = S16_MIN, off_max = S16_MAX; \ s32 off; \ \ if (target >= len || target < 0) \ goto err; \ off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ /* Adjust pc relative offset for 2nd or 3rd insn. */ \ off -= insn - tmp_insns; \ /* Reject anything not fitting into insn->off. */ \ if (off < off_min || off > off_max) \ goto err; \ insn->off = off; \ } while (0) case BPF_JMP | BPF_JA: target = i + fp->k + 1; insn->code = fp->code; BPF_EMIT_JMP; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { /* BPF immediates are signed, zero extend * immediate into tmp register and use it * in compare insn. */ *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); insn->dst_reg = BPF_REG_A; insn->src_reg = BPF_REG_TMP; bpf_src = BPF_X; } else { insn->dst_reg = BPF_REG_A; insn->imm = fp->k; bpf_src = BPF_SRC(fp->code); insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; } /* Common case where 'jump_false' is next insn. */ if (fp->jf == 0) { insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; target = i + fp->jt + 1; BPF_EMIT_JMP; break; } /* Convert some jumps when 'jump_true' is next insn. */ if (fp->jt == 0) { switch (BPF_OP(fp->code)) { case BPF_JEQ: insn->code = BPF_JMP | BPF_JNE | bpf_src; break; case BPF_JGT: insn->code = BPF_JMP | BPF_JLE | bpf_src; break; case BPF_JGE: insn->code = BPF_JMP | BPF_JLT | bpf_src; break; default: goto jmp_rest; } target = i + fp->jf + 1; BPF_EMIT_JMP; break; } jmp_rest: /* Other jumps are mapped into two insns: Jxx and JA. */ target = i + fp->jt + 1; insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; BPF_EMIT_JMP; insn++; insn->code = BPF_JMP | BPF_JA; target = i + fp->jf + 1; BPF_EMIT_JMP; break; /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ case BPF_LDX | BPF_MSH | BPF_B: { struct sock_filter tmp = { .code = BPF_LD | BPF_ABS | BPF_B, .k = fp->k, }; *seen_ld_abs = true; /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = BPF_R0 = *(u8 *) (skb->data + K) */ convert_bpf_ld_abs(&tmp, &insn); insn++; /* A &= 0xf */ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); /* A <<= 2 */ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); /* tmp = X */ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X); /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = tmp */ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; } /* RET_K is remaped into 2 insns. 
RET_A case doesn't need an * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. */ case BPF_RET | BPF_A: case BPF_RET | BPF_K: if (BPF_RVAL(fp->code) == BPF_K) *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, 0, fp->k); *insn = BPF_EXIT_INSN(); break; /* Store to stack. */ case BPF_ST: case BPF_STX: stack_off = fp->k * 4 + 4; *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == BPF_ST ? BPF_REG_A : BPF_REG_X, -stack_off); /* check_load_and_stores() verifies that classic BPF can * load from stack only after write, so tracking * stack_depth for ST|STX insns is enough */ if (new_prog && new_prog->aux->stack_depth < stack_off) new_prog->aux->stack_depth = stack_off; break; /* Load from stack. */ case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: stack_off = fp->k * 4 + 4; *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, BPF_REG_FP, -stack_off); break; /* A = K or X = K */ case BPF_LD | BPF_IMM: case BPF_LDX | BPF_IMM: *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, fp->k); break; /* X = A */ case BPF_MISC | BPF_TAX: *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); break; /* A = X */ case BPF_MISC | BPF_TXA: *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X); break; /* A = skb->len or X = skb->len */ case BPF_LD | BPF_W | BPF_LEN: case BPF_LDX | BPF_W | BPF_LEN: *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, BPF_REG_CTX, offsetof(struct sk_buff, len)); break; /* Access seccomp_data fields. */ case BPF_LDX | BPF_ABS | BPF_W: /* A = *(u32 *) (ctx + K) */ *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); break; /* Unknown instruction. */ default: goto err; } insn++; if (new_prog) memcpy(new_insn, tmp_insns, sizeof(*insn) * (insn - tmp_insns)); new_insn += insn - tmp_insns; } if (!new_prog) { /* Only calculating new length. */ *new_len = new_insn - first_insn; if (*seen_ld_abs) *new_len += 4; /* Prologue bits. */ return 0; } pass++; if (new_flen != new_insn - first_insn) { new_flen = new_insn - first_insn; if (pass > 2) goto err; goto do_pass; } kfree(addrs); BUG_ON(*new_len != new_flen); return 0; err: kfree(addrs); return -EINVAL; } /* Security: * * As we dont want to clear mem[] array for each packet going through * __bpf_prog_run(), we check that filter loaded by user never try to read * a cell if not previously written, and we check all branches to be sure * a malicious user doesn't try to abuse us. 
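 *
 * A small example of the rule being enforced, in bpf_asm notation:
 *
 *   st M[3]        ; writes scratch cell 3, marks it valid
 *   ld M[3]        ; fine, cell 3 was written on this path
 *   ldx M[2]       ; rejected, cell 2 was never written before the read
 *
 * On jumps the current validity mask is propagated into the masks of
 * all possible targets, so a cell only stays readable if it has been
 * written on every path that can reach the load.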
*/ static int check_load_and_stores(const struct sock_filter *filter, int flen) { u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ int pc, ret = 0; BUILD_BUG_ON(BPF_MEMWORDS > 16); masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL); if (!masks) return -ENOMEM; memset(masks, 0xff, flen * sizeof(*masks)); for (pc = 0; pc < flen; pc++) { memvalid &= masks[pc]; switch (filter[pc].code) { case BPF_ST: case BPF_STX: memvalid |= (1 << filter[pc].k); break; case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: if (!(memvalid & (1 << filter[pc].k))) { ret = -EINVAL; goto error; } break; case BPF_JMP | BPF_JA: /* A jump must set masks on target */ masks[pc + 1 + filter[pc].k] &= memvalid; memvalid = ~0; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: /* A jump must set masks on targets */ masks[pc + 1 + filter[pc].jt] &= memvalid; masks[pc + 1 + filter[pc].jf] &= memvalid; memvalid = ~0; break; } } error: kfree(masks); return ret; } static bool chk_code_allowed(u16 code_to_probe) { static const bool codes[] = { /* 32 bit ALU operations */ [BPF_ALU | BPF_ADD | BPF_K] = true, [BPF_ALU | BPF_ADD | BPF_X] = true, [BPF_ALU | BPF_SUB | BPF_K] = true, [BPF_ALU | BPF_SUB | BPF_X] = true, [BPF_ALU | BPF_MUL | BPF_K] = true, [BPF_ALU | BPF_MUL | BPF_X] = true, [BPF_ALU | BPF_DIV | BPF_K] = true, [BPF_ALU | BPF_DIV | BPF_X] = true, [BPF_ALU | BPF_MOD | BPF_K] = true, [BPF_ALU | BPF_MOD | BPF_X] = true, [BPF_ALU | BPF_AND | BPF_K] = true, [BPF_ALU | BPF_AND | BPF_X] = true, [BPF_ALU | BPF_OR | BPF_K] = true, [BPF_ALU | BPF_OR | BPF_X] = true, [BPF_ALU | BPF_XOR | BPF_K] = true, [BPF_ALU | BPF_XOR | BPF_X] = true, [BPF_ALU | BPF_LSH | BPF_K] = true, [BPF_ALU | BPF_LSH | BPF_X] = true, [BPF_ALU | BPF_RSH | BPF_K] = true, [BPF_ALU | BPF_RSH | BPF_X] = true, [BPF_ALU | BPF_NEG] = true, /* Load instructions */ [BPF_LD | BPF_W | BPF_ABS] = true, [BPF_LD | BPF_H | BPF_ABS] = true, [BPF_LD | BPF_B | BPF_ABS] = true, [BPF_LD | BPF_W | BPF_LEN] = true, [BPF_LD | BPF_W | BPF_IND] = true, [BPF_LD | BPF_H | BPF_IND] = true, [BPF_LD | BPF_B | BPF_IND] = true, [BPF_LD | BPF_IMM] = true, [BPF_LD | BPF_MEM] = true, [BPF_LDX | BPF_W | BPF_LEN] = true, [BPF_LDX | BPF_B | BPF_MSH] = true, [BPF_LDX | BPF_IMM] = true, [BPF_LDX | BPF_MEM] = true, /* Store instructions */ [BPF_ST] = true, [BPF_STX] = true, /* Misc instructions */ [BPF_MISC | BPF_TAX] = true, [BPF_MISC | BPF_TXA] = true, /* Return instructions */ [BPF_RET | BPF_K] = true, [BPF_RET | BPF_A] = true, /* Jump instructions */ [BPF_JMP | BPF_JA] = true, [BPF_JMP | BPF_JEQ | BPF_K] = true, [BPF_JMP | BPF_JEQ | BPF_X] = true, [BPF_JMP | BPF_JGE | BPF_K] = true, [BPF_JMP | BPF_JGE | BPF_X] = true, [BPF_JMP | BPF_JGT | BPF_K] = true, [BPF_JMP | BPF_JGT | BPF_X] = true, [BPF_JMP | BPF_JSET | BPF_K] = true, [BPF_JMP | BPF_JSET | BPF_X] = true, }; if (code_to_probe >= ARRAY_SIZE(codes)) return false; return codes[code_to_probe]; } static bool bpf_check_basics_ok(const struct sock_filter *filter, unsigned int flen) { if (filter == NULL) return false; if (flen == 0 || flen > BPF_MAXINSNS) return false; return true; } /** * bpf_check_classic - verify socket filter code * @filter: filter to verify * @flen: length of filter * * Check the user's filter code. If we let some ugly * filter code slip through kaboom! 
The filter must contain * no references or jumps that are out of range, no illegal * instructions, and must end with a RET instruction. * * All jumps are forward as they are not signed. * * Returns 0 if the rule set is legal or -EINVAL if not. */ static int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) { bool anc_found; int pc; /* Check the filter code now */ for (pc = 0; pc < flen; pc++) { const struct sock_filter *ftest = &filter[pc]; /* May we actually operate on this code? */ if (!chk_code_allowed(ftest->code)) return -EINVAL; /* Some instructions need special checks */ switch (ftest->code) { case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_K: /* Check for division by zero */ if (ftest->k == 0) return -EINVAL; break; case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_K: if (ftest->k >= 32) return -EINVAL; break; case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: case BPF_ST: case BPF_STX: /* Check for invalid memory addresses */ if (ftest->k >= BPF_MEMWORDS) return -EINVAL; break; case BPF_JMP | BPF_JA: /* Note, the large ftest->k might cause loops. * Compare this with conditional jumps below, * where offsets are limited. --ANK (981016) */ if (ftest->k >= (unsigned int)(flen - pc - 1)) return -EINVAL; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: /* Both conditionals must be safe */ if (pc + ftest->jt + 1 >= flen || pc + ftest->jf + 1 >= flen) return -EINVAL; break; case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: anc_found = false; if (bpf_anc_helper(ftest) & BPF_ANC) anc_found = true; /* Ancillary operation unknown or unsupported */ if (anc_found == false && ftest->k >= SKF_AD_OFF) return -EINVAL; } } /* Last instruction must be a RET code */ switch (filter[flen - 1].code) { case BPF_RET | BPF_K: case BPF_RET | BPF_A: return check_load_and_stores(filter, flen); } return -EINVAL; } static int bpf_prog_store_orig_filter(struct bpf_prog *fp, const struct sock_fprog *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); struct sock_fprog_kern *fkprog; fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); if (!fp->orig_prog) return -ENOMEM; fkprog = fp->orig_prog; fkprog->len = fprog->len; fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL | __GFP_NOWARN); if (!fkprog->filter) { kfree(fp->orig_prog); return -ENOMEM; } return 0; } static void bpf_release_orig_filter(struct bpf_prog *fp) { struct sock_fprog_kern *fprog = fp->orig_prog; if (fprog) { kfree(fprog->filter); kfree(fprog); } } static void __bpf_prog_release(struct bpf_prog *prog) { if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) { bpf_prog_put(prog); } else { bpf_release_orig_filter(prog); bpf_prog_free(prog); } } static void __sk_filter_release(struct sk_filter *fp) { __bpf_prog_release(fp->prog); kfree(fp); } /** * sk_filter_release_rcu - Release a socket filter by rcu_head * @rcu: rcu_head that contains the sk_filter to free */ static void sk_filter_release_rcu(struct rcu_head *rcu) { struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); __sk_filter_release(fp); } /** * sk_filter_release - release a socket filter * @fp: filter to remove * * Remove a filter from a socket and release its resources. 
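 *
 * Note that the final reference drop only schedules the free through
 * call_rcu(), so a reader that fetched sk->sk_filter under
 * rcu_read_lock() can still safely run the program until the grace
 * period has elapsed.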
*/ static void sk_filter_release(struct sk_filter *fp) { if (refcount_dec_and_test(&fp->refcnt)) call_rcu(&fp->rcu, sk_filter_release_rcu); } void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) { u32 filter_size = bpf_prog_size(fp->prog->len); atomic_sub(filter_size, &sk->sk_omem_alloc); sk_filter_release(fp); } /* try to charge the socket memory if there is space available * return true on success */ static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { u32 filter_size = bpf_prog_size(fp->prog->len); /* same check as in sock_kmalloc() */ if (filter_size <= sysctl_optmem_max && atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { atomic_add(filter_size, &sk->sk_omem_alloc); return true; } return false; } bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) { if (!refcount_inc_not_zero(&fp->refcnt)) return false; if (!__sk_filter_charge(sk, fp)) { sk_filter_release(fp); return false; } return true; } static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) { struct sock_filter *old_prog; struct bpf_prog *old_fp; int err, new_len, old_len = fp->len; bool seen_ld_abs = false; /* We are free to overwrite insns et al right here as it * won't be used at this point in time anymore internally * after the migration to the internal BPF instruction * representation. */ BUILD_BUG_ON(sizeof(struct sock_filter) != sizeof(struct bpf_insn)); /* Conversion cannot happen on overlapping memory areas, * so we need to keep the user BPF around until the 2nd * pass. At this time, the user BPF is stored in fp->insns. */ old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), GFP_KERNEL | __GFP_NOWARN); if (!old_prog) { err = -ENOMEM; goto out_err; } /* 1st pass: calculate the new program length. */ err = bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs); if (err) goto out_err_free; /* Expand fp for appending the new filter representation. */ old_fp = fp; fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); if (!fp) { /* The old_fp is still around in case we couldn't * allocate new memory, so uncharge on that one. */ fp = old_fp; err = -ENOMEM; goto out_err_free; } fp->len = new_len; /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ err = bpf_convert_filter(old_prog, old_len, fp, &new_len, &seen_ld_abs); if (err) /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, * that at this time old_fp has already been released * by krealloc(). */ goto out_err_free; fp = bpf_prog_select_runtime(fp, &err); if (err) goto out_err_free; kfree(old_prog); return fp; out_err_free: kfree(old_prog); out_err: __bpf_prog_release(fp); return ERR_PTR(err); } static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, bpf_aux_classic_check_t trans) { int err; fp->bpf_func = NULL; fp->jited = 0; err = bpf_check_classic(fp->insns, fp->len); if (err) { __bpf_prog_release(fp); return ERR_PTR(err); } /* There might be additional checks and transformations * needed on classic filters, f.e. in case of seccomp. */ if (trans) { err = trans(fp->insns, fp->len); if (err) { __bpf_prog_release(fp); return ERR_PTR(err); } } /* Probe if we can JIT compile the filter and if so, do * the compilation of the filter. */ bpf_jit_compile(fp); /* JIT compiler couldn't process this filter, so do the * internal BPF translation for the optimized interpreter. 
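 *
 * Roughly, the resulting call flow for a classic filter is:
 *
 *   bpf_prepare_filter()
 *     bpf_check_classic()            validate the cBPF program
 *     trans()                        optional extra checks (f.e. seccomp)
 *     bpf_jit_compile()              may set fp->jited
 *     bpf_migrate_filter()           only when !fp->jited
 *       bpf_convert_filter()         two passes: sizing, then remapping
 *       bpf_prog_select_runtime()    pick the interpreter/JIT image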
*/ if (!fp->jited) fp = bpf_migrate_filter(fp); return fp; } /** * bpf_prog_create - create an unattached filter * @pfp: the unattached filter that is created * @fprog: the filter program * * Create a filter independent of any socket. We first run some * sanity checks on it to make sure it does not explode on us later. * If an error occurs or there is insufficient memory for the filter * a negative errno code is returned. On success the return is zero. */ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *fp; /* Make sure new filter is there and in the right amounts. */ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return -EINVAL; fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; memcpy(fp->insns, fprog->filter, fsize); fp->len = fprog->len; /* Since unattached filters are not copied back to user * space through sk_get_filter(), we do not need to hold * a copy here, and can spare us the work. */ fp->orig_prog = NULL; /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ fp = bpf_prepare_filter(fp, NULL); if (IS_ERR(fp)) return PTR_ERR(fp); *pfp = fp; return 0; } EXPORT_SYMBOL_GPL(bpf_prog_create); /** * bpf_prog_create_from_user - create an unattached filter from user buffer * @pfp: the unattached filter that is created * @fprog: the filter program * @trans: post-classic verifier transformation handler * @save_orig: save classic BPF program * * This function effectively does the same as bpf_prog_create(), only * that it builds up its insns buffer from user space provided buffer. * It also allows for passing a bpf_aux_classic_check_t handler. */ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *fp; int err; /* Make sure new filter is there and in the right amounts. */ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return -EINVAL; fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!fp) return -ENOMEM; if (copy_from_user(fp->insns, fprog->filter, fsize)) { __bpf_prog_free(fp); return -EFAULT; } fp->len = fprog->len; fp->orig_prog = NULL; if (save_orig) { err = bpf_prog_store_orig_filter(fp, fprog); if (err) { __bpf_prog_free(fp); return -ENOMEM; } } /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ fp = bpf_prepare_filter(fp, trans); if (IS_ERR(fp)) return PTR_ERR(fp); *pfp = fp; return 0; } EXPORT_SYMBOL_GPL(bpf_prog_create_from_user); void bpf_prog_destroy(struct bpf_prog *fp) { __bpf_prog_release(fp); } EXPORT_SYMBOL_GPL(bpf_prog_destroy); static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) { struct sk_filter *fp, *old_fp; fp = kmalloc(sizeof(*fp), GFP_KERNEL); if (!fp) return -ENOMEM; fp->prog = prog; if (!__sk_filter_charge(sk, fp)) { kfree(fp); return -ENOMEM; } refcount_set(&fp->refcnt, 1); old_fp = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); rcu_assign_pointer(sk->sk_filter, fp); if (old_fp) sk_filter_uncharge(sk, old_fp); return 0; } static struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) { unsigned int fsize = bpf_classic_proglen(fprog); struct bpf_prog *prog; int err; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return ERR_PTR(-EPERM); /* Make sure new filter is there and in the right amounts. 
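 *
 * For reference, a classic filter reaches this point from user space
 * roughly as follows (sketch, error handling omitted):
 *
 *   struct sock_filter code[] = {
 *           BPF_STMT(BPF_RET | BPF_K, 0xffffffff),    accept all
 *   };
 *   struct sock_fprog fprog = {
 *           .len    = sizeof(code) / sizeof(code[0]),
 *           .filter = code,
 *   };
 *
 *   setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
 *
 * which ends up in sk_attach_filter() and thus here.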
*/ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) return ERR_PTR(-EINVAL); prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); if (!prog) return ERR_PTR(-ENOMEM); if (copy_from_user(prog->insns, fprog->filter, fsize)) { __bpf_prog_free(prog); return ERR_PTR(-EFAULT); } prog->len = fprog->len; err = bpf_prog_store_orig_filter(prog, fprog); if (err) { __bpf_prog_free(prog); return ERR_PTR(-ENOMEM); } /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ return bpf_prepare_filter(prog, NULL); } /** * sk_attach_filter - attach a socket filter * @fprog: the filter program * @sk: the socket to use * * Attach the user's filter code. We first run some sanity checks on * it to make sure it does not explode on us later. If an error * occurs or there is insufficient memory for the filter a negative * errno code is returned. On success the return is zero. */ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct bpf_prog *prog = __get_filter(fprog, sk); int err; if (IS_ERR(prog)) return PTR_ERR(prog); err = __sk_attach_prog(prog, sk); if (err < 0) { __bpf_prog_release(prog); return err; } return 0; } EXPORT_SYMBOL_GPL(sk_attach_filter); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct bpf_prog *prog = __get_filter(fprog, sk); int err; if (IS_ERR(prog)) return PTR_ERR(prog); if (bpf_prog_size(prog->len) > sysctl_optmem_max) err = -ENOMEM; else err = reuseport_attach_prog(sk, prog); if (err) __bpf_prog_release(prog); return err; } static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) { if (sock_flag(sk, SOCK_FILTER_LOCKED)) return ERR_PTR(-EPERM); return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); } int sk_attach_bpf(u32 ufd, struct sock *sk) { struct bpf_prog *prog = __get_bpf(ufd, sk); int err; if (IS_ERR(prog)) return PTR_ERR(prog); err = __sk_attach_prog(prog, sk); if (err < 0) { bpf_prog_put(prog); return err; } return 0; } int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) { struct bpf_prog *prog; int err; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(prog) && PTR_ERR(prog) == -EINVAL) prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT); if (IS_ERR(prog)) return PTR_ERR(prog); if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) { /* Like other non BPF_PROG_TYPE_SOCKET_FILTER * bpf prog (e.g. sockmap). It depends on the * limitation imposed by bpf_prog_load(). * Hence, sysctl_optmem_max is not checked. 
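 *
 * From user space this attach point is reached roughly via:
 *
 *   setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF,
 *              &prog_fd, sizeof(prog_fd));
 *
 * where prog_fd refers to a program loaded with bpf(BPF_PROG_LOAD, ...)
 * as either BPF_PROG_TYPE_SOCKET_FILTER or BPF_PROG_TYPE_SK_REUSEPORT,
 * the latter being further restricted below to TCP/UDP sockets in the
 * AF_INET/AF_INET6 families.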
*/ if ((sk->sk_type != SOCK_STREAM && sk->sk_type != SOCK_DGRAM) || (sk->sk_protocol != IPPROTO_UDP && sk->sk_protocol != IPPROTO_TCP) || (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) { err = -ENOTSUPP; goto err_prog_put; } } else { /* BPF_PROG_TYPE_SOCKET_FILTER */ if (bpf_prog_size(prog->len) > sysctl_optmem_max) { err = -ENOMEM; goto err_prog_put; } } err = reuseport_attach_prog(sk, prog); err_prog_put: if (err) bpf_prog_put(prog); return err; } void sk_reuseport_prog_free(struct bpf_prog *prog) { if (!prog) return; if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) bpf_prog_put(prog); else bpf_prog_destroy(prog); } struct bpf_scratchpad { union { __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; u8 buff[MAX_BPF_STACK]; }; }; static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); static inline int __bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { return skb_ensure_writable(skb, write_len); } static inline int bpf_try_make_writable(struct sk_buff *skb, unsigned int write_len) { int err = __bpf_try_make_writable(skb, write_len); bpf_compute_data_pointers(skb); return err; } static int bpf_try_make_head_writable(struct sk_buff *skb) { return bpf_try_make_writable(skb, skb_headlen(skb)); } static inline void bpf_push_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len); } static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) { if (skb_at_tc_ingress(skb)) skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); } BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, const void *, from, u32, len, u64, flags) { void *ptr; if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) return -EINVAL; if (unlikely(offset > INT_MAX)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; ptr = skb->data + offset; if (flags & BPF_F_RECOMPUTE_CSUM) __skb_postpull_rcsum(skb, ptr, len, offset); memcpy(ptr, from, len); if (flags & BPF_F_RECOMPUTE_CSUM) __skb_postpush_rcsum(skb, ptr, len, offset); if (flags & BPF_F_INVALIDATE_HASH) skb_clear_hash(skb); return 0; } static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .func = bpf_skb_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, void *, to, u32, len) { void *ptr; if (unlikely(offset > INT_MAX)) goto err_clear; ptr = skb_header_pointer(skb, offset, len, to); if (unlikely(!ptr)) goto err_clear; if (ptr != to) memcpy(to, ptr, len); return 0; err_clear: memset(to, 0, len); return -EFAULT; } static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .func = bpf_skb_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, u32, offset, void *, to, u32, len, u32, start_header) { u8 *end = skb_tail_pointer(skb); u8 *start, *ptr; if (unlikely(offset > 0xffff)) goto err_clear; switch (start_header) { case BPF_HDR_START_MAC: if (unlikely(!skb_mac_header_was_set(skb))) goto err_clear; start = skb_mac_header(skb); break; case BPF_HDR_START_NET: start = skb_network_header(skb); break; default: goto err_clear; } ptr = start + offset; if (likely(ptr + len <= end)) { memcpy(to, ptr, len); return 
0; } err_clear: memset(to, 0, len); return -EFAULT; } static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = { .func = bpf_skb_load_bytes_relative, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) { /* Idea is the following: should the needed direct read/write * test fail during runtime, we can pull in more data and redo * again, since implicitly, we invalidate previous checks here. * * Or, since we know how much we need to make read/writeable, * this can be done once at the program beginning for direct * access case. By this we overcome limitations of only current * headroom being accessible. */ return bpf_try_make_writable(skb, len ? : skb_headlen(skb)); } static const struct bpf_func_proto bpf_skb_pull_data_proto = { .func = bpf_skb_pull_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { int err = __bpf_try_make_writable(skb, write_len); bpf_compute_data_end_sk_skb(skb); return err; } BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) { /* Idea is the following: should the needed direct read/write * test fail during runtime, we can pull in more data and redo * again, since implicitly, we invalidate previous checks here. * * Or, since we know how much we need to make read/writeable, * this can be done once at the program beginning for direct * access case. By this we overcome limitations of only current * headroom being accessible. */ return sk_skb_try_make_writable(skb, len ? 
: skb_headlen(skb)); } static const struct bpf_func_proto sk_skb_pull_data_proto = { .func = sk_skb_pull_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { __sum16 *ptr; if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) return -EFAULT; ptr = (__sum16 *)(skb->data + offset); switch (flags & BPF_F_HDR_FIELD_MASK) { case 0: if (unlikely(from != 0)) return -EINVAL; csum_replace_by_diff(ptr, to); break; case 2: csum_replace2(ptr, from, to); break; case 4: csum_replace4(ptr, from, to); break; default: return -EINVAL; } return 0; } static const struct bpf_func_proto bpf_l3_csum_replace_proto = { .func = bpf_l3_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, u64, from, u64, to, u64, flags) { bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; bool do_mforce = flags & BPF_F_MARK_ENFORCE; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) return -EFAULT; ptr = (__sum16 *)(skb->data + offset); if (is_mmzero && !do_mforce && !*ptr) return 0; switch (flags & BPF_F_HDR_FIELD_MASK) { case 0: if (unlikely(from != 0)) return -EINVAL; inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); break; case 4: inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo); break; default: return -EINVAL; } if (is_mmzero && !*ptr) *ptr = CSUM_MANGLED_0; return 0; } static const struct bpf_func_proto bpf_l4_csum_replace_proto = { .func = bpf_l4_csum_replace, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, __be32 *, to, u32, to_size, __wsum, seed) { struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); u32 diff_size = from_size + to_size; int i, j = 0; /* This is quite flexible, some examples: * * from_size == 0, to_size > 0, seed := csum --> pushing data * from_size > 0, to_size == 0, seed := csum --> pulling data * from_size > 0, to_size > 0, seed := 0 --> diffing data * * Even for diffing, from_size and to_size don't need to be equal. 
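 *
 * A typical direct packet write sequence in a program could look like
 * this sketch, where the offsets and address variables are placeholders:
 *
 *   __wsum diff = bpf_csum_diff(&old_daddr, 4, &new_daddr, 4, 0);
 *
 *   bpf_skb_store_bytes(skb, ip_daddr_off, &new_daddr, 4, 0);
 *   bpf_l3_csum_replace(skb, ip_csum_off,  0, diff, 0);
 *   bpf_l4_csum_replace(skb, tcp_csum_off, 0, diff, BPF_F_PSEUDO_HDR);
 *
 * i.e. the diff is computed once and then fed to the csum replace
 * helpers with a zero field size, meaning "apply this diff".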
*/ if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || diff_size > sizeof(sp->diff))) return -EINVAL; for (i = 0; i < from_size / sizeof(__be32); i++, j++) sp->diff[j] = ~from[i]; for (i = 0; i < to_size / sizeof(__be32); i++, j++) sp->diff[j] = to[i]; return csum_partial(sp->diff, diff_size, seed); } static const struct bpf_func_proto bpf_csum_diff_proto = { .func = bpf_csum_diff, .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM_OR_NULL, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_MEM_OR_NULL, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) { /* The interface is to be used in combination with bpf_csum_diff() * for direct packet writes. csum rotation for alignment as well * as emulating csum_sub() can be done from the eBPF program. */ if (skb->ip_summed == CHECKSUM_COMPLETE) return (skb->csum = csum_add(skb->csum, csum)); return -ENOTSUPP; } static const struct bpf_func_proto bpf_csum_update_proto = { .func = bpf_csum_update, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb(dev, skb); } static inline int __bpf_rx_skb_no_mac(struct net_device *dev, struct sk_buff *skb) { int ret = ____dev_forward_skb(dev, skb); if (likely(!ret)) { skb->dev = dev; ret = netif_rx(skb); } return ret; } static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) { int ret; if (dev_xmit_recursion()) { net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); kfree_skb(skb); return -ENETDOWN; } skb->dev = dev; skb->tstamp = 0; dev_xmit_recursion_inc(); ret = dev_queue_xmit(skb); dev_xmit_recursion_dec(); return ret; } static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, u32 flags) { unsigned int mlen = skb_network_offset(skb); if (mlen) { __skb_pull(skb, mlen); /* At ingress, the mac header has already been pulled once. * At egress, skb_pospull_rcsum has to be done in case that * the skb is originated from ingress (i.e. a forwarded skb) * to ensure that rcsum starts at net header. */ if (!skb_at_tc_ingress(skb)) skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); } skb_pop_mac_header(skb); skb_reset_mac_len(skb); return flags & BPF_F_INGRESS ? __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); } static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, u32 flags) { /* Verify that a link layer header is carried */ if (unlikely(skb->mac_header >= skb->network_header)) { kfree_skb(skb); return -ERANGE; } bpf_push_mac_rcsum(skb); return flags & BPF_F_INGRESS ? __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); } static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, u32 flags) { if (dev_is_mac_header_xmit(dev)) return __bpf_redirect_common(skb, dev, flags); else return __bpf_redirect_no_mac(skb, dev, flags); } BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { struct net_device *dev; struct sk_buff *clone; int ret; if (unlikely(flags & ~(BPF_F_INGRESS))) return -EINVAL; dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); if (unlikely(!dev)) return -EINVAL; clone = skb_clone(skb, GFP_ATOMIC); if (unlikely(!clone)) return -ENOMEM; /* For direct write, we need to keep the invariant that the skbs * we're dealing with need to be uncloned. 
Should uncloning fail * here, we need to free the just generated clone to unclone once * again. */ ret = bpf_try_make_head_writable(skb); if (unlikely(ret)) { kfree_skb(clone); return -ENOMEM; } return __bpf_redirect(clone, dev, flags); } static const struct bpf_func_proto bpf_clone_redirect_proto = { .func = bpf_clone_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags & ~(BPF_F_INGRESS))) return TC_ACT_SHOT; ri->ifindex = ifindex; ri->flags = flags; return TC_ACT_REDIRECT; } int skb_do_redirect(struct sk_buff *skb) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct net_device *dev; dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); ri->ifindex = 0; if (unlikely(!dev)) { kfree_skb(skb); return -EINVAL; } return __bpf_redirect(skb, dev, ri->flags); } static const struct bpf_func_proto bpf_redirect_proto = { .func = bpf_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); /* If user passes invalid input drop the packet. */ if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; tcb->bpf.flags = flags; tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); if (!tcb->bpf.sk_redir) return SK_DROP; return SK_PASS; } static const struct bpf_func_proto bpf_sk_redirect_hash_proto = { .func = bpf_sk_redirect_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); /* If user passes invalid input drop the packet. */ if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; tcb->bpf.flags = flags; tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key); if (!tcb->bpf.sk_redir) return SK_DROP; return SK_PASS; } struct sock *do_sk_redirect_map(struct sk_buff *skb) { struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); return tcb->bpf.sk_redir; } static const struct bpf_func_proto bpf_sk_redirect_map_proto = { .func = bpf_sk_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg, struct bpf_map *, map, void *, key, u64, flags) { /* If user passes invalid input drop the packet. */ if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; msg->flags = flags; msg->sk_redir = __sock_hash_lookup_elem(map, key); if (!msg->sk_redir) return SK_DROP; return SK_PASS; } static const struct bpf_func_proto bpf_msg_redirect_hash_proto = { .func = bpf_msg_redirect_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, struct bpf_map *, map, u32, key, u64, flags) { /* If user passes invalid input drop the packet. 
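 *
 * Typical use is from an SK_MSG verdict program attached to a sockmap,
 * roughly (map and key are placeholders):
 *
 *   SEC("sk_msg")
 *   int prog(struct sk_msg_md *msg)
 *   {
 *           return bpf_msg_redirect_map(msg, &sock_map, key, 0);
 *   }
 *
 * i.e. the SK_PASS/SK_DROP verdict of the helper is returned as-is.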
*/ if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; msg->flags = flags; msg->sk_redir = __sock_map_lookup_elem(map, key); if (!msg->sk_redir) return SK_DROP; return SK_PASS; } struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) { return msg->sk_redir; } static const struct bpf_func_proto bpf_msg_redirect_map_proto = { .func = bpf_msg_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes) { msg->apply_bytes = bytes; return 0; } static const struct bpf_func_proto bpf_msg_apply_bytes_proto = { .func = bpf_msg_apply_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes) { msg->cork_bytes = bytes; return 0; } static const struct bpf_func_proto bpf_msg_cork_bytes_proto = { .func = bpf_msg_cork_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; #define sk_msg_iter_var(var) \ do { \ var++; \ if (var == MAX_SKB_FRAGS) \ var = 0; \ } while (0) BPF_CALL_4(bpf_msg_pull_data, struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags) { unsigned int len = 0, offset = 0, copy = 0, poffset = 0; int bytes = end - start, bytes_sg_total; struct scatterlist *sg = msg->sg_data; int first_sg, last_sg, i, shift; unsigned char *p, *to, *from; struct page *page; if (unlikely(flags || end <= start)) return -EINVAL; /* First find the starting scatterlist element */ i = msg->sg_start; do { len = sg[i].length; if (start < offset + len) break; offset += len; sk_msg_iter_var(i); } while (i != msg->sg_end); if (unlikely(start >= offset + len)) return -EINVAL; first_sg = i; /* The start may point into the sg element so we need to also * account for the headroom. */ bytes_sg_total = start - offset + bytes; if (!msg->sg_copy[i] && bytes_sg_total <= len) goto out; /* At this point we need to linearize multiple scatterlist * elements or a single shared page. Either way we need to * copy into a linear buffer exclusively owned by BPF. Then * place the buffer in the scatterlist and fixup the original * entries by removing the entries now in the linear buffer * and shifting the remaining entries. For now we do not try * to copy partial entries to avoid complexity of running out * of sg_entry slots. The downside is reading a single byte * will copy the entire sg entry. */ do { copy += sg[i].length; sk_msg_iter_var(i); if (bytes_sg_total <= copy) break; } while (i != msg->sg_end); last_sg = i; if (unlikely(bytes_sg_total > copy)) return -EINVAL; page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, get_order(copy)); if (unlikely(!page)) return -ENOMEM; p = page_address(page); i = first_sg; do { from = sg_virt(&sg[i]); len = sg[i].length; to = p + poffset; memcpy(to, from, len); poffset += len; sg[i].length = 0; put_page(sg_page(&sg[i])); sk_msg_iter_var(i); } while (i != last_sg); sg[first_sg].length = copy; sg_set_page(&sg[first_sg], page, copy, 0); /* To repair sg ring we need to shift entries. If we only * had a single entry though we can just replace it and * be done. Otherwise walk the ring and shift the entries. */ WARN_ON_ONCE(last_sg == first_sg); shift = last_sg > first_sg ? 
last_sg - first_sg - 1 : MAX_SKB_FRAGS - first_sg + last_sg - 1; if (!shift) goto out; i = first_sg; sk_msg_iter_var(i); do { int move_from; if (i + shift >= MAX_SKB_FRAGS) move_from = i + shift - MAX_SKB_FRAGS; else move_from = i + shift; if (move_from == msg->sg_end) break; sg[i] = sg[move_from]; sg[move_from].length = 0; sg[move_from].page_link = 0; sg[move_from].offset = 0; sk_msg_iter_var(i); } while (1); msg->sg_end -= shift; if (msg->sg_end < 0) msg->sg_end += MAX_SKB_FRAGS; out: msg->data = sg_virt(&sg[first_sg]) + start - offset; msg->data_end = msg->data + bytes; return 0; } static const struct bpf_func_proto bpf_msg_pull_data_proto = { .func = bpf_msg_pull_data, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); } static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { .func = bpf_get_cgroup_classid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) { return dst_tclassid(skb); } static const struct bpf_func_proto bpf_get_route_realm_proto = { .func = bpf_get_route_realm, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) { /* If skb_clear_hash() was called due to mangling, we can * trigger SW recalculation here. Later access to hash * can then use the inline skb->hash via context directly * instead of calling this helper again. */ return skb_get_hash(skb); } static const struct bpf_func_proto bpf_get_hash_recalc_proto = { .func = bpf_get_hash_recalc, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) { /* After all direct packet write, this can be used once for * triggering a lazy recalc on next skb_get_hash() invocation. */ skb_clear_hash(skb); return 0; } static const struct bpf_func_proto bpf_set_hash_invalid_proto = { .func = bpf_set_hash_invalid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash) { /* Set user specified hash as L4(+), so that it gets returned * on skb_get_hash() call unless BPF prog later on triggers a * skb_clear_hash(). 
*/ __skb_set_sw_hash(skb, hash, true); return 0; } static const struct bpf_func_proto bpf_set_hash_proto = { .func = bpf_set_hash, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, u16, vlan_tci) { int ret; if (unlikely(vlan_proto != htons(ETH_P_8021Q) && vlan_proto != htons(ETH_P_8021AD))) vlan_proto = htons(ETH_P_8021Q); bpf_push_mac_rcsum(skb); ret = skb_vlan_push(skb, vlan_proto, vlan_tci); bpf_pull_mac_rcsum(skb); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_vlan_push_proto = { .func = bpf_skb_vlan_push, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { int ret; bpf_push_mac_rcsum(skb); ret = skb_vlan_pop(skb); bpf_pull_mac_rcsum(skb); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_vlan_pop_proto = { .func = bpf_skb_vlan_pop, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) { /* Caller already did skb_cow() with len as headroom, * so no need to do it here. */ skb_push(skb, len); memmove(skb->data, skb->data + len, off); memset(skb->data + off, 0, len); /* No skb_postpush_rcsum(skb, skb->data + off, len) * needed here as it does not change the skb->csum * result for checksum complete when summing over * zeroed blocks. */ return 0; } static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) { /* skb_ensure_writable() is not needed here, as we're * already working on an uncloned skb. */ if (unlikely(!pskb_may_pull(skb, off + len))) return -ENOMEM; skb_postpull_rcsum(skb, skb->data + off, len); memmove(skb->data + len, skb->data, off); __skb_pull(skb, len); return 0; } static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len) { bool trans_same = skb->transport_header == skb->network_header; int ret; /* There's no need for __skb_push()/__skb_pull() pair to * get to the start of the mac header as we're guaranteed * to always start from here under eBPF. */ ret = bpf_skb_generic_push(skb, off, len); if (likely(!ret)) { skb->mac_header -= len; skb->network_header -= len; if (trans_same) skb->transport_header = skb->network_header; } return ret; } static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) { bool trans_same = skb->transport_header == skb->network_header; int ret; /* Same here, __skb_push()/__skb_pull() pair not needed. */ ret = bpf_skb_generic_pop(skb, off, len); if (likely(!ret)) { skb->mac_header += len; skb->network_header += len; if (trans_same) skb->transport_header = skb->network_header; } return ret; } static int bpf_skb_proto_4_to_6(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_push(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* SKB_GSO_TCPV4 needs to be changed into * SKB_GSO_TCPV6. */ if (shinfo->gso_type & SKB_GSO_TCPV4) { shinfo->gso_type &= ~SKB_GSO_TCPV4; shinfo->gso_type |= SKB_GSO_TCPV6; } /* Header must be checked, and gso_segs recomputed. 
*/ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; } skb->protocol = htons(ETH_P_IPV6); skb_clear_hash(skb); return 0; } static int bpf_skb_proto_6_to_4(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); u32 off = skb_mac_header_len(skb); int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_pop(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* SKB_GSO_TCPV6 needs to be changed into * SKB_GSO_TCPV4. */ if (shinfo->gso_type & SKB_GSO_TCPV6) { shinfo->gso_type &= ~SKB_GSO_TCPV6; shinfo->gso_type |= SKB_GSO_TCPV4; } /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; } skb->protocol = htons(ETH_P_IP); skb_clear_hash(skb); return 0; } static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) { __be16 from_proto = skb_protocol(skb, true); if (from_proto == htons(ETH_P_IP) && to_proto == htons(ETH_P_IPV6)) return bpf_skb_proto_4_to_6(skb); if (from_proto == htons(ETH_P_IPV6) && to_proto == htons(ETH_P_IP)) return bpf_skb_proto_6_to_4(skb); return -ENOTSUPP; } BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, u64, flags) { int ret; if (unlikely(flags)) return -EINVAL; /* General idea is that this helper does the basic groundwork * needed for changing the protocol, and eBPF program fills the * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace() * and other helpers, rather than passing a raw buffer here. * * The rationale is to keep this minimal and without a need to * deal with raw packet data. F.e. even if we would pass buffers * here, the program still needs to call the bpf_lX_csum_replace() * helpers anyway. Plus, this way we keep also separation of * concerns, since f.e. bpf_skb_store_bytes() should only take * care of stores. * * Currently, additional options and extension header space are * not supported, but flags register is reserved so we can adapt * that. For offloads, we mark packet as dodgy, so that headers * need to be verified first. */ ret = bpf_skb_proto_xlat(skb, proto); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_change_proto_proto = { .func = bpf_skb_change_proto, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) { /* We only allow a restricted subset to be changed for now. 
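 *
 * That subset is what skb_pkt_type_ok() accepts, i.e. the normal
 * receive types up to PACKET_OTHERHOST; PACKET_OUTGOING and above are
 * rejected.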
*/ if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || !skb_pkt_type_ok(pkt_type))) return -EINVAL; skb->pkt_type = pkt_type; return 0; } static const struct bpf_func_proto bpf_skb_change_type_proto = { .func = bpf_skb_change_type, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; static u32 bpf_skb_net_base_len(const struct sk_buff *skb) { switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): return sizeof(struct iphdr); case htons(ETH_P_IPV6): return sizeof(struct ipv6hdr); default: return ~0U; } } static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) { u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_cow(skb, len_diff); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_push(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* Due to header grow, MSS needs to be downgraded. */ skb_decrease_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; } return 0; } static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) { u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_unclone(skb, GFP_ATOMIC); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_pop(skb, off, len_diff); if (unlikely(ret < 0)) return ret; if (skb_is_gso(skb)) { struct skb_shared_info *shinfo = skb_shinfo(skb); /* Due to header shrink, MSS can be upgraded. */ skb_increase_gso_size(shinfo, len_diff); /* Header must be checked, and gso_segs recomputed. */ shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_segs = 0; } return 0; } #define BPF_SKB_MAX_LEN SKB_MAX_ALLOC static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) { bool trans_same = skb->transport_header == skb->network_header; u32 len_cur, len_diff_abs = abs(len_diff); u32 len_min = bpf_skb_net_base_len(skb); u32 len_max = BPF_SKB_MAX_LEN; __be16 proto = skb_protocol(skb, true); bool shrink = len_diff < 0; int ret; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; if (unlikely(proto != htons(ETH_P_IP) && proto != htons(ETH_P_IPV6))) return -ENOTSUPP; len_cur = skb->len - skb_network_offset(skb); if (skb_transport_header_was_set(skb) && !trans_same) len_cur = skb_network_header_len(skb); if ((shrink && (len_diff_abs >= len_cur || len_cur - len_diff_abs < len_min)) || (!shrink && (skb->len + len_diff_abs > len_max && !skb_is_gso(skb)))) return -ENOTSUPP; ret = shrink ? 
bpf_skb_net_shrink(skb, len_diff_abs) : bpf_skb_net_grow(skb, len_diff_abs); bpf_compute_data_pointers(skb); return ret; } BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32, mode, u64, flags) { if (unlikely(flags)) return -EINVAL; if (likely(mode == BPF_ADJ_ROOM_NET)) return bpf_skb_adjust_net(skb, len_diff); return -ENOTSUPP; } static const struct bpf_func_proto bpf_skb_adjust_room_proto = { .func = bpf_skb_adjust_room, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; static u32 __bpf_skb_min_len(const struct sk_buff *skb) { u32 min_len = skb_network_offset(skb); if (skb_transport_header_was_set(skb)) min_len = skb_transport_offset(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) min_len = skb_checksum_start_offset(skb) + skb->csum_offset + sizeof(__sum16); return min_len; } static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) { unsigned int old_len = skb->len; int ret; ret = __skb_grow_rcsum(skb, new_len); if (!ret) memset(skb->data + old_len, 0, new_len - old_len); return ret; } static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) { return __skb_trim_rcsum(skb, new_len); } static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, u64 flags) { u32 max_len = BPF_SKB_MAX_LEN; u32 min_len = __bpf_skb_min_len(skb); int ret; if (unlikely(flags || new_len > max_len || new_len < min_len)) return -EINVAL; if (skb->encapsulation) return -ENOTSUPP; /* The basic idea of this helper is that it's performing the * needed work to either grow or trim an skb, and eBPF program * rewrites the rest via helpers like bpf_skb_store_bytes(), * bpf_lX_csum_replace() and others rather than passing a raw * buffer here. This one is a slow path helper and intended * for replies with control messages. * * Like in bpf_skb_change_proto(), we want to keep this rather * minimal and without protocol specifics so that we are able * to separate concerns as in bpf_skb_store_bytes() should only * be the one responsible for writing buffers. * * It's really expected to be a slow path operation here for * control message replies, so we're implicitly linearizing, * uncloning and drop offloads from the skb by this. 
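 *
 * E.g. a tc program crafting a small fixed size reply could do
 * something like this sketch before rewriting the headers:
 *
 *   if (bpf_skb_change_tail(skb, 64, 0))
 *           return TC_ACT_SHOT;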
*/ ret = __bpf_try_make_writable(skb, skb->len); if (!ret) { if (new_len > skb->len) ret = bpf_skb_grow_rcsum(skb, new_len); else if (new_len < skb->len) ret = bpf_skb_trim_rcsum(skb, new_len); if (!ret && skb_is_gso(skb)) skb_gso_reset(skb); } return ret; } BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { int ret = __bpf_skb_change_tail(skb, new_len, flags); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_change_tail_proto = { .func = bpf_skb_change_tail, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { int ret = __bpf_skb_change_tail(skb, new_len, flags); bpf_compute_data_end_sk_skb(skb); return ret; } static const struct bpf_func_proto sk_skb_change_tail_proto = { .func = sk_skb_change_tail, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, u64 flags) { u32 max_len = BPF_SKB_MAX_LEN; u32 new_len = skb->len + head_room; int ret; if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) || new_len < skb->len)) return -EINVAL; ret = skb_cow(skb, head_room); if (likely(!ret)) { /* Idea for this helper is that we currently only * allow to expand on mac header. This means that * skb->protocol network header, etc, stay as is. * Compared to bpf_skb_change_tail(), we're more * flexible due to not needing to linearize or * reset GSO. Intention for this helper is to be * used by an L3 skb that needs to push mac header * for redirection into L2 device. */ __skb_push(skb, head_room); memset(skb->data, 0, head_room); skb_reset_mac_header(skb); skb_reset_mac_len(skb); } return ret; } BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { int ret = __bpf_skb_change_head(skb, head_room, flags); bpf_compute_data_pointers(skb); return ret; } static const struct bpf_func_proto bpf_skb_change_head_proto = { .func = bpf_skb_change_head, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { int ret = __bpf_skb_change_head(skb, head_room, flags); bpf_compute_data_end_sk_skb(skb); return ret; } static const struct bpf_func_proto sk_skb_change_head_proto = { .func = sk_skb_change_head, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) { return xdp_data_meta_unsupported(xdp) ? 
0 : xdp->data - xdp->data_meta; } BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); unsigned long metalen = xdp_get_metalen(xdp); void *data_start = xdp_frame_end + metalen; void *data = xdp->data + offset; if (unlikely(data < data_start || data > xdp->data_end - ETH_HLEN)) return -EINVAL; if (metalen) memmove(xdp->data_meta + offset, xdp->data_meta, metalen); xdp->data_meta += offset; xdp->data = data; return 0; } static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .func = bpf_xdp_adjust_head, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) { void *data_end = xdp->data_end + offset; /* only shrinking is allowed for now. */ if (unlikely(offset >= 0)) return -EINVAL; if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; xdp->data_end = data_end; return 0; } static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = { .func = bpf_xdp_adjust_tail, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) { void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); void *meta = xdp->data_meta + offset; unsigned long metalen = xdp->data - meta; if (xdp_data_meta_unsupported(xdp)) return -ENOTSUPP; if (unlikely(meta < xdp_frame_end || meta > xdp->data)) return -EINVAL; if (unlikely((metalen & (sizeof(__u32) - 1)) || (metalen > 32))) return -EACCES; xdp->data_meta = meta; return 0; } static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .func = bpf_xdp_adjust_meta, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; static int __bpf_tx_xdp(struct net_device *dev, struct bpf_map *map, struct xdp_buff *xdp, u32 index) { struct xdp_frame *xdpf; int err, sent; if (!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; } err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); if (unlikely(err)) return err; xdpf = convert_to_xdp_frame(xdp); if (unlikely(!xdpf)) return -EOVERFLOW; sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH); if (sent <= 0) return sent; return 0; } static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, struct bpf_map *map, struct xdp_buff *xdp, u32 index) { int err; switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: { struct bpf_dtab_netdev *dst = fwd; err = dev_map_enqueue(dst, xdp, dev_rx); if (err) return err; __dev_map_insert_ctx(map, index); break; } case BPF_MAP_TYPE_CPUMAP: { struct bpf_cpu_map_entry *rcpu = fwd; err = cpu_map_enqueue(rcpu, xdp, dev_rx); if (err) return err; __cpu_map_insert_ctx(map, index); break; } case BPF_MAP_TYPE_XSKMAP: { struct xdp_sock *xs = fwd; err = __xsk_map_redirect(map, xdp, xs); return err; } default: return -EBADRQC; } return 0; } void xdp_do_flush_map(void) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map = ri->map_to_flush; ri->map_to_flush = NULL; if (map) { switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: __dev_map_flush(map); break; case BPF_MAP_TYPE_CPUMAP: __cpu_map_flush(map); break; case BPF_MAP_TYPE_XSKMAP: __xsk_map_flush(map); break; default: break; } } } EXPORT_SYMBOL_GPL(xdp_do_flush_map); static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) { switch (map->map_type) { case BPF_MAP_TYPE_DEVMAP: return __dev_map_lookup_elem(map, 
index); case BPF_MAP_TYPE_CPUMAP: return __cpu_map_lookup_elem(map, index); case BPF_MAP_TYPE_XSKMAP: return __xsk_map_lookup_elem(map, index); default: return NULL; } } void bpf_clear_redirect_map(struct bpf_map *map) { struct bpf_redirect_info *ri; int cpu; for_each_possible_cpu(cpu) { ri = per_cpu_ptr(&bpf_redirect_info, cpu); /* Avoid polluting remote cacheline due to writes if * not needed. Once we pass this test, we need the * cmpxchg() to make sure it hasn't been changed in * the meantime by remote CPU. */ if (unlikely(READ_ONCE(ri->map) == map)) cmpxchg(&ri->map, map, NULL); } } static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog, struct bpf_map *map) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); u32 index = ri->ifindex; void *fwd = NULL; int err; ri->ifindex = 0; WRITE_ONCE(ri->map, NULL); fwd = __xdp_map_lookup_elem(map, index); if (!fwd) { err = -EINVAL; goto err; } if (ri->map_to_flush && ri->map_to_flush != map) xdp_do_flush_map(); err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index); if (unlikely(err)) goto err; ri->map_to_flush = map; _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); return 0; err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); return err; } int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map = READ_ONCE(ri->map); struct net_device *fwd; u32 index = ri->ifindex; int err; if (map) return xdp_do_redirect_map(dev, xdp, xdp_prog, map); fwd = dev_get_by_index_rcu(dev_net(dev), index); ri->ifindex = 0; if (unlikely(!fwd)) { err = -EINVAL; goto err; } err = __bpf_tx_xdp(fwd, NULL, xdp, 0); if (unlikely(err)) goto err; _trace_xdp_redirect(dev, xdp_prog, index); return 0; err: _trace_xdp_redirect_err(dev, xdp_prog, index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_redirect); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog, struct bpf_map *map) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); u32 index = ri->ifindex; void *fwd = NULL; int err = 0; ri->ifindex = 0; WRITE_ONCE(ri->map, NULL); fwd = __xdp_map_lookup_elem(map, index); if (unlikely(!fwd)) { err = -EINVAL; goto err; } if (map->map_type == BPF_MAP_TYPE_DEVMAP) { struct bpf_dtab_netdev *dst = fwd; err = dev_map_generic_redirect(dst, skb, xdp_prog); if (unlikely(err)) goto err; } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { struct xdp_sock *xs = fwd; err = xsk_generic_rcv(xs, xdp); if (err) goto err; consume_skb(skb); } else { /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; goto err; } _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); return 0; err: _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); return err; } int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_map *map = READ_ONCE(ri->map); u32 index = ri->ifindex; struct net_device *fwd; int err = 0; if (map) return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, map); ri->ifindex = 0; fwd = dev_get_by_index_rcu(dev_net(dev), index); if (unlikely(!fwd)) { err = -EINVAL; goto err; } err = xdp_ok_fwd_dev(fwd, skb->len); if (unlikely(err)) goto err; skb->dev = fwd; _trace_xdp_redirect(dev, xdp_prog, index); generic_xdp_tx(skb, xdp_prog); return 0; err: 
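/* Illustrative sketch, not part of this file: the program side of the
 * map based redirect path handled by xdp_do_redirect_map() above.
 * Assumes iproute2/libbpf era map definitions (struct bpf_map_def with
 * SEC("maps")); the map name, sizes and the fixed key are examples only.
 *
 *	struct bpf_map_def SEC("maps") tx_port = {
 *		.type		= BPF_MAP_TYPE_DEVMAP,
 *		.key_size	= sizeof(int),
 *		.value_size	= sizeof(int),
 *		.max_entries	= 64,
 *	};
 *
 *	SEC("xdp")
 *	int xdp_redirect_egress(struct xdp_md *ctx)
 *	{
 *		// Key 0 is assumed to be filled with an egress ifindex
 *		// by user space before the program is attached.
 *		return bpf_redirect_map(&tx_port, 0, 0);
 *	}
 *
 * The actual transmit happens later, from xdp_do_redirect_map() and the
 * per-map flush in xdp_do_flush_map(), which is what lets devmap and
 * cpumap batch frames instead of sending them one by one.
 */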
_trace_xdp_redirect_err(dev, xdp_prog, index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags)) return XDP_ABORTED; ri->ifindex = ifindex; ri->flags = flags; WRITE_ONCE(ri->map, NULL); return XDP_REDIRECT; } static const struct bpf_func_proto bpf_xdp_redirect_proto = { .func = bpf_xdp_redirect, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); if (unlikely(flags)) return XDP_ABORTED; ri->ifindex = ifindex; ri->flags = flags; WRITE_ONCE(ri->map, map); return XDP_REDIRECT; } static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .func = bpf_xdp_redirect_map, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, unsigned long off, unsigned long len) { void *ptr = skb_header_pointer(skb, off, len, dst_buff); if (unlikely(!ptr)) return len; if (ptr != dst_buff) memcpy(dst_buff, ptr, len); return 0; } BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, u64, flags, void *, meta, u64, meta_size) { u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; if (unlikely(skb_size > skb->len)) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, bpf_skb_copy); } static const struct bpf_func_proto bpf_skb_event_output_proto = { .func = bpf_skb_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static unsigned short bpf_tunnel_key_af(u64 flags) { return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; } BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to, u32, size, u64, flags) { const struct ip_tunnel_info *info = skb_tunnel_info(skb); u8 compat[sizeof(struct bpf_tunnel_key)]; void *to_orig = to; int err; if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) { err = -EINVAL; goto err_clear; } if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) { err = -EPROTO; goto err_clear; } if (unlikely(size != sizeof(struct bpf_tunnel_key))) { err = -EINVAL; switch (size) { case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): goto set_compat; case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. 
*/ if (ip_tunnel_info_af(info) != AF_INET) goto err_clear; set_compat: to = (struct bpf_tunnel_key *)compat; break; default: goto err_clear; } } to->tunnel_id = be64_to_cpu(info->key.tun_id); to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; to->tunnel_ext = 0; if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, sizeof(to->remote_ipv6)); to->tunnel_label = be32_to_cpu(info->key.label); } else { to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); to->tunnel_label = 0; } if (unlikely(size != sizeof(struct bpf_tunnel_key))) memcpy(to_orig, to, size); return 0; err_clear: memset(to_orig, 0, size); return err; } static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .func = bpf_skb_get_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) { const struct ip_tunnel_info *info = skb_tunnel_info(skb); int err; if (unlikely(!info || !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) { err = -ENOENT; goto err_clear; } if (unlikely(size < info->options_len)) { err = -ENOMEM; goto err_clear; } ip_tunnel_info_opts_get(to, info); if (size > info->options_len) memset(to + info->options_len, 0, size - info->options_len); return info->options_len; err_clear: memset(to, 0, size); return err; } static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { .func = bpf_skb_get_tunnel_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE, }; static struct metadata_dst __percpu *md_dst; BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, const struct bpf_tunnel_key *, from, u32, size, u64, flags) { struct metadata_dst *md = this_cpu_ptr(md_dst); u8 compat[sizeof(struct bpf_tunnel_key)]; struct ip_tunnel_info *info; if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER))) return -EINVAL; if (unlikely(size != sizeof(struct bpf_tunnel_key))) { switch (size) { case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): /* Fixup deprecated structure layouts here, so we have * a common path later on. 
*/ memcpy(compat, from, size); memset(compat + size, 0, sizeof(compat) - size); from = (const struct bpf_tunnel_key *) compat; break; default: return -EINVAL; } } if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) || from->tunnel_ext)) return -EINVAL; skb_dst_drop(skb); dst_hold((struct dst_entry *) md); skb_dst_set(skb, (struct dst_entry *) md); info = &md->u.tun_info; memset(info, 0, sizeof(*info)); info->mode = IP_TUNNEL_INFO_TX; info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; if (flags & BPF_F_DONT_FRAGMENT) info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; if (flags & BPF_F_ZERO_CSUM_TX) info->key.tun_flags &= ~TUNNEL_CSUM; if (flags & BPF_F_SEQ_NUMBER) info->key.tun_flags |= TUNNEL_SEQ; info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; info->key.ttl = from->tunnel_ttl; if (flags & BPF_F_TUNINFO_IPV6) { info->mode |= IP_TUNNEL_INFO_IPV6; memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, sizeof(from->remote_ipv6)); info->key.label = cpu_to_be32(from->tunnel_label) & IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); } return 0; } static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .func = bpf_skb_set_tunnel_key, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, const u8 *, from, u32, size) { struct ip_tunnel_info *info = skb_tunnel_info(skb); const struct metadata_dst *md = this_cpu_ptr(md_dst); if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) return -EINVAL; if (unlikely(size > IP_TUNNEL_OPTS_MAX)) return -ENOMEM; ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT); return 0; } static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { .func = bpf_skb_set_tunnel_opt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, }; static const struct bpf_func_proto * bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { struct metadata_dst __percpu *tmp; tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, METADATA_IP_TUNNEL, GFP_KERNEL); if (!tmp) return NULL; if (cmpxchg(&md_dst, NULL, tmp)) metadata_dst_free_percpu(tmp); } switch (which) { case BPF_FUNC_skb_set_tunnel_key: return &bpf_skb_set_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_opt: return &bpf_skb_set_tunnel_opt_proto; default: return NULL; } } BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); struct cgroup *cgrp; struct sock *sk; sk = skb_to_full_sk(skb); if (!sk || !sk_fullsock(sk)) return -ENOENT; if (unlikely(idx >= array->map.max_entries)) return -E2BIG; cgrp = READ_ONCE(array->ptrs[idx]); if (unlikely(!cgrp)) return -EAGAIN; return sk_under_cgroup_hierarchy(sk, cgrp); } static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { .func = bpf_skb_under_cgroup, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; #ifdef CONFIG_SOCK_CGROUP_DATA BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) { struct sock *sk = skb_to_full_sk(skb); struct cgroup *cgrp; if (!sk || !sk_fullsock(sk)) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); return cgrp->kn->id.id; } static const struct bpf_func_proto bpf_skb_cgroup_id_proto 
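/* Illustrative sketch, not part of this file: a tc egress program using
 * bpf_skb_set_tunnel_key() above on a collect_md ("external") tunnel
 * device, in the style of the in-tree tunnel selftests. Assumes
 * bpf_helpers.h wrappers and linux/pkt_cls.h; addresses and IDs are
 * examples only.
 *
 *	SEC("classifier")
 *	int set_vxlan_key(struct __sk_buff *skb)
 *	{
 *		struct bpf_tunnel_key key = {};
 *
 *		key.tunnel_id	= 42;
 *		key.remote_ipv4	= 0xac100164;	// 172.16.1.100, host order
 *		key.tunnel_ttl	= 64;
 *		if (bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
 *					   BPF_F_ZERO_CSUM_TX) < 0)
 *			return TC_ACT_SHOT;
 *		return TC_ACT_OK;
 *	}
 *
 * Host byte order for remote_ipv4 matches the cpu_to_be32() conversion
 * done by the helper above.
 */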
= { .func = bpf_skb_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, ancestor_level) { struct sock *sk = skb_to_full_sk(skb); struct cgroup *ancestor; struct cgroup *cgrp; if (!sk || !sk_fullsock(sk)) return 0; cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ancestor = cgroup_ancestor(cgrp, ancestor_level); if (!ancestor) return 0; return ancestor->kn->id.id; } static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .func = bpf_skb_ancestor_cgroup_id, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; #endif static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, unsigned long off, unsigned long len) { memcpy(dst_buff, src_buff + off, len); return 0; } BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, u64, flags, void *, meta, u64, meta_size) { u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) return -EINVAL; if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data))) return -EFAULT; return bpf_event_output(map, flags, meta, meta_size, xdp->data, xdp_size, bpf_xdp_copy); } static const struct bpf_func_proto bpf_xdp_event_output_proto = { .func = bpf_xdp_event_output, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) { return skb->sk ? sock_gen_cookie(skb->sk) : 0; } static const struct bpf_func_proto bpf_get_socket_cookie_proto = { .func = bpf_get_socket_cookie, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx) { return sock_gen_cookie(ctx->sk); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = { .func = bpf_get_socket_cookie_sock_addr, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return sock_gen_cookie(ctx->sk); } static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = { .func = bpf_get_socket_cookie_sock_ops, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); kuid_t kuid; if (!sk || !sk_fullsock(sk)) return overflowuid; kuid = sock_net_uid(sock_net(sk), sk); return from_kuid_munged(sock_net(sk)->user_ns, kuid); } static const struct bpf_func_proto bpf_get_socket_uid_proto = { .func = bpf_get_socket_uid, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { struct sock *sk = bpf_sock->sk; int ret = 0; int val; if (!sk_fullsock(sk)) return -EINVAL; if (level == SOL_SOCKET) { if (optlen != sizeof(int)) return -EINVAL; val = *((int *)optval); /* Only some socketops are supported */ switch (optname) { case SO_RCVBUF: val = min_t(u32, val, sysctl_rmem_max); sk->sk_userlocks |= SOCK_RCVBUF_LOCK; sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); break; case SO_SNDBUF: val = min_t(u32, val, sysctl_wmem_max); sk->sk_userlocks |= SOCK_SNDBUF_LOCK; sk->sk_sndbuf = max_t(int, val * 2, 
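/* Illustrative sketch, not part of this file: sampling packets to user
 * space through bpf_xdp_event_output() above (exposed to programs as
 * bpf_perf_event_output()). The upper 32 bits of the flags argument
 * carry the number of packet bytes to append, as checked against
 * BPF_F_CTXLEN_MASK above. Map name and sizes are examples only.
 *
 *	struct bpf_map_def SEC("maps") perf_events = {
 *		.type		= BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 *		.key_size	= sizeof(int),
 *		.value_size	= sizeof(__u32),
 *		.max_entries	= 64,	// at least the number of CPUs
 *	};
 *
 *	SEC("xdp")
 *	int xdp_sample(struct xdp_md *ctx)
 *	{
 *		__u64 cookie = 1;	// hypothetical marker for user space
 *		// Append the first 64 packet bytes to each sample; the
 *		// helper fails with -EFAULT if the packet is shorter.
 *		__u64 flags = BPF_F_CURRENT_CPU | (64ULL << 32);
 *
 *		bpf_perf_event_output(ctx, &perf_events, flags,
 *				      &cookie, sizeof(cookie));
 *		return XDP_PASS;
 *	}
 */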
SOCK_MIN_SNDBUF); break; case SO_MAX_PACING_RATE: sk->sk_max_pacing_rate = val; sk->sk_pacing_rate = min(sk->sk_pacing_rate, sk->sk_max_pacing_rate); break; case SO_PRIORITY: sk->sk_priority = val; break; case SO_RCVLOWAT: if (val < 0) val = INT_MAX; sk->sk_rcvlowat = val ? : 1; break; case SO_MARK: if (sk->sk_mark != val) { sk->sk_mark = val; sk_dst_reset(sk); } break; default: ret = -EINVAL; } #ifdef CONFIG_INET } else if (level == SOL_IP) { if (optlen != sizeof(int) || sk->sk_family != AF_INET) return -EINVAL; val = *((int *)optval); /* Only some options are supported */ switch (optname) { case IP_TOS: if (val < -1 || val > 0xff) { ret = -EINVAL; } else { struct inet_sock *inet = inet_sk(sk); if (val == -1) val = 0; inet->tos = val; } break; default: ret = -EINVAL; } #if IS_ENABLED(CONFIG_IPV6) } else if (level == SOL_IPV6) { if (optlen != sizeof(int) || sk->sk_family != AF_INET6) return -EINVAL; val = *((int *)optval); /* Only some options are supported */ switch (optname) { case IPV6_TCLASS: if (val < -1 || val > 0xff) { ret = -EINVAL; } else { struct ipv6_pinfo *np = inet6_sk(sk); if (val == -1) val = 0; np->tclass = val; } break; default: ret = -EINVAL; } #endif } else if (level == SOL_TCP && sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; ret = tcp_set_congestion_control(sk, name, false, reinit, true); } else { struct tcp_sock *tp = tcp_sk(sk); if (optlen != sizeof(int)) return -EINVAL; val = *((int *)optval); /* Only some options are supported */ switch (optname) { case TCP_BPF_IW: if (val <= 0 || tp->data_segs_out > tp->syn_data) ret = -EINVAL; else tp->snd_cwnd = val; break; case TCP_BPF_SNDCWND_CLAMP: if (val <= 0) { ret = -EINVAL; } else { tp->snd_cwnd_clamp = val; tp->snd_ssthresh = val; } break; default: ret = -EINVAL; } } #endif } else { ret = -EINVAL; } return ret; } static const struct bpf_func_proto bpf_setsockopt_proto = { .func = bpf_setsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { struct sock *sk = bpf_sock->sk; if (!sk_fullsock(sk)) goto err_clear; #ifdef CONFIG_INET if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { if (optname == TCP_CONGESTION) { struct inet_connection_sock *icsk = inet_csk(sk); if (!icsk->icsk_ca_ops || optlen <= 1) goto err_clear; strncpy(optval, icsk->icsk_ca_ops->name, optlen); optval[optlen - 1] = 0; } else { goto err_clear; } } else if (level == SOL_IP) { struct inet_sock *inet = inet_sk(sk); if (optlen != sizeof(int) || sk->sk_family != AF_INET) goto err_clear; /* Only some options are supported */ switch (optname) { case IP_TOS: *((int *)optval) = (int)inet->tos; break; default: goto err_clear; } #if IS_ENABLED(CONFIG_IPV6) } else if (level == SOL_IPV6) { struct ipv6_pinfo *np = inet6_sk(sk); if (optlen != sizeof(int) || sk->sk_family != AF_INET6) goto err_clear; /* Only some options are supported */ switch (optname) { case IPV6_TCLASS: *((int *)optval) = (int)np->tclass; break; default: goto err_clear; } #endif } else { goto err_clear; } return 0; #endif err_clear: memset(optval, 0, optlen); return -EINVAL; } static const struct bpf_func_proto 
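/* Illustrative sketch, not part of this file: a sockops program driving
 * bpf_setsockopt() above to pick a congestion control algorithm per
 * connection. Assumes bpf_helpers.h wrappers and that the named module
 * ("cubic" here, purely as an example) is available; the return
 * convention follows the in-tree tcp sockops samples.
 *
 *	SEC("sockops")
 *	int select_cc(struct bpf_sock_ops *skops)
 *	{
 *		char cc[] = "cubic";
 *
 *		switch (skops->op) {
 *		case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
 *		case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
 *			bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
 *				       cc, sizeof(cc));
 *			break;
 *		}
 *		return 1;
 *	}
 */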
bpf_getsockopt_proto = { .func = bpf_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_UNINIT_MEM, .arg5_type = ARG_CONST_SIZE, }; BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, int, argval) { struct sock *sk = bpf_sock->sk; int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) return -EINVAL; if (val) tcp_sk(sk)->bpf_sock_ops_cb_flags = val; return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); } static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { .func = bpf_sock_ops_cb_flags_set, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; EXPORT_SYMBOL_GPL(ipv6_bpf_stub); BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, int, addr_len) { #ifdef CONFIG_INET struct sock *sk = ctx->sk; int err; /* Binding to port can be expensive so it's prohibited in the helper. * Only binding to IP is supported. */ err = -EINVAL; if (addr->sa_family == AF_INET) { if (addr_len < sizeof(struct sockaddr_in)) return err; if (((struct sockaddr_in *)addr)->sin_port != htons(0)) return err; return __inet_bind(sk, addr, addr_len, true, false); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) return err; if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) return err; /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ return -EAFNOSUPPORT; } static const struct bpf_func_proto bpf_bind_proto = { .func = bpf_bind, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, }; #ifdef CONFIG_XFRM BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, struct bpf_xfrm_state *, to, u32, size, u64, flags) { const struct sec_path *sp = skb_sec_path(skb); const struct xfrm_state *x; if (!sp || unlikely(index >= sp->len || flags)) goto err_clear; x = sp->xvec[index]; if (unlikely(size != sizeof(struct bpf_xfrm_state))) goto err_clear; to->reqid = x->props.reqid; to->spi = x->id.spi; to->family = x->props.family; to->ext = 0; if (to->family == AF_INET6) { memcpy(to->remote_ipv6, x->props.saddr.a6, sizeof(to->remote_ipv6)); } else { to->remote_ipv4 = x->props.saddr.a4; memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); } return 0; err_clear: memset(to, 0, size); return -EINVAL; } static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { .func = bpf_skb_get_xfrm_state, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; #endif #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, const struct neighbour *neigh, const struct net_device *dev) { memcpy(params->dmac, neigh->ha, ETH_ALEN); memcpy(params->smac, dev->dev_addr, ETH_ALEN); params->h_vlan_TCI = 0; params->h_vlan_proto = 0; params->ifindex = dev->ifindex; return 0; } #endif #if IS_ENABLED(CONFIG_INET) static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 flags, bool check_mtu) { struct 
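/* Illustrative sketch, not part of this file: a cgroup connect4 program
 * using bpf_bind() above to force a source address, close to the
 * in-tree connect4 selftest. Assumes bpf_helpers.h/bpf_endian.h
 * wrappers; the address is an example only. Note the helper only binds
 * to an address: sin_port must stay 0.
 *
 *	SEC("cgroup/connect4")
 *	int connect_v4(struct bpf_sock_addr *ctx)
 *	{
 *		struct sockaddr_in sa = {};
 *
 *		sa.sin_family = AF_INET;
 *		sa.sin_addr.s_addr = bpf_htonl(0x7f000006);	// 127.0.0.6
 *		if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)))
 *			return 0;	// reject the connect()
 *		return 1;		// let the connect() proceed
 *	}
 */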
in_device *in_dev; struct neighbour *neigh; struct net_device *dev; struct fib_result res; struct fib_nh *nh; struct flowi4 fl4; int err; u32 mtu; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) return -ENODEV; /* verify forwarding is enabled on this interface */ in_dev = __in_dev_get_rcu(dev); if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl4.flowi4_iif = 1; fl4.flowi4_oif = params->ifindex; } else { fl4.flowi4_iif = params->ifindex; fl4.flowi4_oif = 0; } fl4.flowi4_tos = params->tos & IPTOS_RT_MASK; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; fl4.flowi4_proto = params->l4_protocol; fl4.daddr = params->ipv4_dst; fl4.saddr = params->ipv4_src; fl4.fl4_sport = params->sport; fl4.fl4_dport = params->dport; if (flags & BPF_FIB_LOOKUP_DIRECT) { u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; struct fib_table *tb; tb = fib_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { fl4.flowi4_mark = 0; fl4.flowi4_secid = 0; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_uid = sock_net_uid(net, NULL); err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); } if (err) { /* map fib lookup errors to RTN_ type */ if (err == -EINVAL) return BPF_FIB_LKUP_RET_BLACKHOLE; if (err == -EHOSTUNREACH) return BPF_FIB_LKUP_RET_UNREACHABLE; if (err == -EACCES) return BPF_FIB_LKUP_RET_PROHIBIT; return BPF_FIB_LKUP_RET_NOT_FWDED; } if (res.type != RTN_UNICAST) return BPF_FIB_LKUP_RET_NOT_FWDED; if (res.fi->fib_nhs > 1) fib_select_path(net, &res, &fl4, NULL); if (check_mtu) { mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); if (params->tot_len > mtu) return BPF_FIB_LKUP_RET_FRAG_NEEDED; } nh = &res.fi->fib_nh[res.nh_sel]; /* do not handle lwt encaps right now */ if (nh->nh_lwtstate) return BPF_FIB_LKUP_RET_UNSUPP_LWT; dev = nh->nh_dev; if (nh->nh_gw) params->ipv4_dst = nh->nh_gw; params->rt_metric = res.fi->fib_priority; /* xdp and cls_bpf programs are run in RCU-bh so * rcu_read_lock_bh is not needed here */ neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); if (!neigh) return BPF_FIB_LKUP_RET_NO_NEIGH; return bpf_fib_set_fwd_params(params, neigh, dev); } #endif #if IS_ENABLED(CONFIG_IPV6) static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, u32 flags, bool check_mtu) { struct in6_addr *src = (struct in6_addr *) params->ipv6_src; struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; struct neighbour *neigh; struct net_device *dev; struct inet6_dev *idev; struct fib6_info *f6i; struct flowi6 fl6; int strict = 0; int oif; u32 mtu; /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) return BPF_FIB_LKUP_RET_NOT_FWDED; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) return -ENODEV; idev = __in6_dev_get_safely(dev); if (unlikely(!idev || !idev->cnf.forwarding)) return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl6.flowi6_iif = 1; oif = fl6.flowi6_oif = params->ifindex; } else { oif = fl6.flowi6_iif = params->ifindex; fl6.flowi6_oif = 0; strict = RT6_LOOKUP_F_HAS_SADDR; } fl6.flowlabel = params->flowinfo; fl6.flowi6_scope = 0; fl6.flowi6_flags = 0; fl6.mp_hash = 0; fl6.flowi6_proto = params->l4_protocol; fl6.daddr = *dst; fl6.saddr = *src; fl6.fl6_sport = params->sport; fl6.fl6_dport = params->dport; if (flags & BPF_FIB_LOOKUP_DIRECT) { u32 tbid = l3mdev_fib_table_rcu(dev) ? 
: RT_TABLE_MAIN; struct fib6_table *tb; tb = ipv6_stub->fib6_get_table(net, tbid); if (unlikely(!tb)) return BPF_FIB_LKUP_RET_NOT_FWDED; f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); } else { fl6.flowi6_mark = 0; fl6.flowi6_secid = 0; fl6.flowi6_tun_key.tun_id = 0; fl6.flowi6_uid = sock_net_uid(net, NULL); f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict); } if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) return BPF_FIB_LKUP_RET_NOT_FWDED; if (unlikely(f6i->fib6_flags & RTF_REJECT)) { switch (f6i->fib6_type) { case RTN_BLACKHOLE: return BPF_FIB_LKUP_RET_BLACKHOLE; case RTN_UNREACHABLE: return BPF_FIB_LKUP_RET_UNREACHABLE; case RTN_PROHIBIT: return BPF_FIB_LKUP_RET_PROHIBIT; default: return BPF_FIB_LKUP_RET_NOT_FWDED; } } if (f6i->fib6_type != RTN_UNICAST) return BPF_FIB_LKUP_RET_NOT_FWDED; if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, fl6.flowi6_oif, NULL, strict); if (check_mtu) { mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src); if (params->tot_len > mtu) return BPF_FIB_LKUP_RET_FRAG_NEEDED; } if (f6i->fib6_nh.nh_lwtstate) return BPF_FIB_LKUP_RET_UNSUPP_LWT; if (f6i->fib6_flags & RTF_GATEWAY) *dst = f6i->fib6_nh.nh_gw; dev = f6i->fib6_nh.nh_dev; params->rt_metric = f6i->fib6_metric; /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is * not needed here. Can not use __ipv6_neigh_lookup_noref here * because we need to get nd_tbl via the stub */ neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, ndisc_hashfn, dst, dev); if (!neigh) return BPF_FIB_LKUP_RET_NO_NEIGH; return bpf_fib_set_fwd_params(params, neigh, dev); } #endif BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, struct bpf_fib_lookup *, params, int, plen, u32, flags) { if (plen < sizeof(*params)) return -EINVAL; if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)) return -EINVAL; switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, flags, true); #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, flags, true); #endif } return -EAFNOSUPPORT; } static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { .func = bpf_xdp_fib_lookup, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, struct bpf_fib_lookup *, params, int, plen, u32, flags) { struct net *net = dev_net(skb->dev); int rc = -EAFNOSUPPORT; bool check_mtu = false; if (plen < sizeof(*params)) return -EINVAL; if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)) return -EINVAL; if (params->tot_len) check_mtu = true; switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu); break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu); break; #endif } if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) { struct net_device *dev; /* When tot_len isn't provided by user, check skb * against MTU of FIB lookup resulting net_device */ dev = dev_get_by_index_rcu(net, params->ifindex); if (!is_skb_forwardable(dev, skb)) rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; } return rc; } static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { .func = bpf_skb_fib_lookup, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = 
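/* Illustrative sketch, not part of this file: an XDP forwarder built on
 * bpf_xdp_fib_lookup() above (bpf_fib_lookup() on the program side),
 * condensed from the idea behind samples/bpf/xdp_fwd_kern.c. Assumes
 * bpf_helpers.h/bpf_endian.h plus linux/if_ether.h and linux/ip.h;
 * IPv4 only, and unlike a real forwarder it skips TTL and checksum
 * updates.
 *
 *	SEC("xdp")
 *	int xdp_fwd_v4(struct xdp_md *ctx)
 *	{
 *		void *data_end = (void *)(long)ctx->data_end;
 *		void *data = (void *)(long)ctx->data;
 *		struct ethhdr *eth = data;
 *		struct iphdr *iph = data + sizeof(*eth);
 *		struct bpf_fib_lookup fib = {};
 *
 *		if ((void *)(iph + 1) > data_end)
 *			return XDP_PASS;
 *		if (eth->h_proto != bpf_htons(ETH_P_IP))
 *			return XDP_PASS;
 *
 *		fib.family	= AF_INET;
 *		fib.tos		= iph->tos;
 *		fib.l4_protocol	= iph->protocol;
 *		fib.tot_len	= bpf_ntohs(iph->tot_len);
 *		fib.ipv4_src	= iph->saddr;
 *		fib.ipv4_dst	= iph->daddr;
 *		fib.ifindex	= ctx->ingress_ifindex;
 *
 *		if (bpf_fib_lookup(ctx, &fib, sizeof(fib), 0) !=
 *		    BPF_FIB_LKUP_RET_SUCCESS)
 *			return XDP_PASS;
 *
 *		// On success dmac/smac/ifindex describe the next hop.
 *		__builtin_memcpy(eth->h_dest, fib.dmac, ETH_ALEN);
 *		__builtin_memcpy(eth->h_source, fib.smac, ETH_ALEN);
 *		return bpf_redirect(fib.ifindex, 0);
 *	}
 */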
ARG_PTR_TO_CTX, .arg2_type = ARG_PTR_TO_MEM, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) { int err; struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr; if (!seg6_validate_srh(srh, len)) return -EINVAL; switch (type) { case BPF_LWT_ENCAP_SEG6_INLINE: if (skb_protocol(skb, true) != htons(ETH_P_IPV6)) return -EBADMSG; err = seg6_do_srh_inline(skb, srh); break; case BPF_LWT_ENCAP_SEG6: skb_reset_inner_headers(skb); skb->encapsulation = 1; err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6); break; default: return -EINVAL; } bpf_compute_data_pointers(skb); if (err) return err; skb_set_transport_header(skb, sizeof(struct ipv6hdr)); return seg6_lookup_nexthop(skb, NULL, 0); } #endif /* CONFIG_IPV6_SEG6_BPF */ BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, u32, len) { switch (type) { #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) case BPF_LWT_ENCAP_SEG6: case BPF_LWT_ENCAP_SEG6_INLINE: return bpf_push_seg6_encap(skb, type, hdr, len); #endif default: return -EINVAL; } } static const struct bpf_func_proto bpf_lwt_push_encap_proto = { .func = bpf_lwt_push_encap, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM, .arg4_type = ARG_CONST_SIZE }; #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset, const void *, from, u32, len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_tlvs, *srh_end, *ptr; int srhoff = 0; if (srh == NULL) return -EINVAL; srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4)); srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen); ptr = skb->data + offset; if (ptr >= srh_tlvs && ptr + len <= srh_end) srh_state->valid = false; else if (ptr < (void *)&srh->flags || ptr + len > (void *)&srh->segments) return -EFAULT; if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) return -EINVAL; srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); memcpy(skb->data + offset, from, len); return 0; } static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { .func = bpf_lwt_seg6_store_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM, .arg4_type = ARG_CONST_SIZE }; static void bpf_update_srh_state(struct sk_buff *skb) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); int srhoff = 0; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) { srh_state->srh = NULL; } else { srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_state->hdrlen = srh_state->srh->hdrlen << 3; srh_state->valid = true; } } BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb, u32, action, void *, param, u32, param_len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); int hdroff = 0; int err; switch (action) { case SEG6_LOCAL_ACTION_END_X: if (!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; if (param_len != sizeof(struct in6_addr)) return -EINVAL; return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0); case SEG6_LOCAL_ACTION_END_T: if (!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; if (param_len != sizeof(int)) return -EINVAL; return seg6_lookup_nexthop(skb, NULL, *(int 
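/* Illustrative sketch, not part of this file: a seg6local BPF program
 * using bpf_lwt_seg6_action() to perform an End.T style lookup in a
 * specific table, in the spirit of the in-tree lwt_seg6local selftest.
 * Assumes bpf_helpers.h wrappers and linux/seg6_local.h; the table id
 * is an example only.
 *
 *	SEC("lwt_seg6local")
 *	int seg6_end_t(struct __sk_buff *skb)
 *	{
 *		int table = 100;
 *
 *		if (bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_T,
 *					&table, sizeof(table)) < 0)
 *			return BPF_DROP;
 *		// The action already performed the nexthop lookup.
 *		return BPF_REDIRECT;
 *	}
 */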
*)param); case SEG6_LOCAL_ACTION_END_DT6: if (!seg6_bpf_has_valid_srh(skb)) return -EBADMSG; if (param_len != sizeof(int)) return -EINVAL; if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0) return -EBADMSG; if (!pskb_pull(skb, hdroff)) return -EBADMSG; skb_postpull_rcsum(skb, skb_network_header(skb), hdroff); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->encapsulation = 0; bpf_compute_data_pointers(skb); bpf_update_srh_state(skb); return seg6_lookup_nexthop(skb, NULL, *(int *)param); case SEG6_LOCAL_ACTION_END_B6: if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE, param, param_len); if (!err) bpf_update_srh_state(skb); return err; case SEG6_LOCAL_ACTION_END_B6_ENCAP: if (srh_state->srh && !seg6_bpf_has_valid_srh(skb)) return -EBADMSG; err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6, param, param_len); if (!err) bpf_update_srh_state(skb); return err; default: return -EINVAL; } } static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { .func = bpf_lwt_seg6_action, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM, .arg4_type = ARG_CONST_SIZE }; BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset, s32, len) { struct seg6_bpf_srh_state *srh_state = this_cpu_ptr(&seg6_bpf_srh_states); struct ipv6_sr_hdr *srh = srh_state->srh; void *srh_end, *srh_tlvs, *ptr; struct ipv6hdr *hdr; int srhoff = 0; int ret; if (unlikely(srh == NULL)) return -EINVAL; srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) + ((srh->first_segment + 1) << 4)); srh_end = (void *)((unsigned char *)srh + sizeof(*srh) + srh_state->hdrlen); ptr = skb->data + offset; if (unlikely(ptr < srh_tlvs || ptr > srh_end)) return -EFAULT; if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end)) return -EFAULT; if (len > 0) { ret = skb_cow_head(skb, len); if (unlikely(ret < 0)) return ret; ret = bpf_skb_net_hdr_push(skb, offset, len); } else { ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len); } bpf_compute_data_pointers(skb); if (unlikely(ret < 0)) return ret; hdr = (struct ipv6hdr *)skb->data; hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) return -EINVAL; srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); srh_state->hdrlen += len; srh_state->valid = false; return 0; } static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { .func = bpf_lwt_seg6_adjust_srh, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; #endif /* CONFIG_IPV6_SEG6_BPF */ bool bpf_helper_changes_pkt_data(void *func) { if (func == bpf_skb_vlan_push || func == bpf_skb_vlan_pop || func == bpf_skb_store_bytes || func == bpf_skb_change_proto || func == bpf_skb_change_head || func == sk_skb_change_head || func == bpf_skb_change_tail || func == sk_skb_change_tail || func == bpf_skb_adjust_room || func == bpf_skb_pull_data || func == sk_skb_pull_data || func == bpf_clone_redirect || func == bpf_l3_csum_replace || func == bpf_l4_csum_replace || func == bpf_xdp_adjust_head || func == bpf_xdp_adjust_meta || func == bpf_msg_pull_data || func == bpf_xdp_adjust_tail || #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) func == bpf_lwt_seg6_store_bytes || func == bpf_lwt_seg6_adjust_srh || func == bpf_lwt_seg6_action || #endif func == bpf_lwt_push_encap) return true; return false; } static const 
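/* bpf_helper_changes_pkt_data() above is what tells the verifier that a
 * helper may have moved or reallocated packet memory. Illustrative
 * sketch, not part of this file, of the reload pattern every program
 * must follow after calling one of the listed helpers; assumes the
 * usual bpf_helpers.h environment, names are examples only.
 *
 *	SEC("classifier")
 *	int reload_after_pull(struct __sk_buff *skb)
 *	{
 *		void *data, *data_end;
 *
 *		if (bpf_skb_pull_data(skb, 0) < 0)
 *			return TC_ACT_SHOT;
 *		// Pointers derived before the call are rejected by the
 *		// verifier; they have to be re-read from the context.
 *		data = (void *)(long)skb->data;
 *		data_end = (void *)(long)skb->data_end;
 *		if (data + sizeof(struct ethhdr) > data_end)
 *			return TC_ACT_SHOT;
 *		return TC_ACT_OK;
 *	}
 */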
struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_map_lookup_elem: return &bpf_map_lookup_elem_proto; case BPF_FUNC_map_update_elem: return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_raw_smp_processor_id_proto; case BPF_FUNC_get_numa_node_id: return &bpf_get_numa_node_id_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); /* else: fall through */ default: return NULL; } } static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { /* inet and inet6 sockets are created in a process * context so there is always a valid uid/gid */ case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } } static const struct bpf_func_proto * sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { /* inet and inet6 sockets are created in a process * context so there is always a valid uid/gid */ case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_bind: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: return &bpf_bind_proto; default: return NULL; } case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } } static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; default: return bpf_base_func_proto(func_id); } } static const struct bpf_func_proto * cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; default: return sk_filter_func_proto(func_id, prog); } } static const struct bpf_func_proto * tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_skb_pull_data: return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: return &bpf_l4_csum_replace_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_proto; case BPF_FUNC_skb_vlan_push: return &bpf_skb_vlan_push_proto; case BPF_FUNC_skb_vlan_pop: return &bpf_skb_vlan_pop_proto; case BPF_FUNC_skb_change_proto: return 
&bpf_skb_change_proto_proto; case BPF_FUNC_skb_change_type: return &bpf_skb_change_type_proto; case BPF_FUNC_skb_adjust_room: return &bpf_skb_adjust_room_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_skb_get_tunnel_opt: return &bpf_skb_get_tunnel_opt_proto; case BPF_FUNC_skb_set_tunnel_opt: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; case BPF_FUNC_set_hash: return &bpf_set_hash_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: return &bpf_skb_fib_lookup_proto; #ifdef CONFIG_XFRM case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; #endif #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; #endif default: return bpf_base_func_proto(func_id); } } static const struct bpf_func_proto * xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; case BPF_FUNC_xdp_adjust_meta: return &bpf_xdp_adjust_meta_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; case BPF_FUNC_redirect_map: return &bpf_xdp_redirect_map_proto; case BPF_FUNC_xdp_adjust_tail: return &bpf_xdp_adjust_tail_proto; case BPF_FUNC_fib_lookup: return &bpf_xdp_fib_lookup_proto; default: return bpf_base_func_proto(func_id); } } static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_getsockopt_proto; case BPF_FUNC_sock_ops_cb_flags_set: return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; case BPF_FUNC_sock_hash_update: return &bpf_sock_hash_update_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_ops_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } } static const struct bpf_func_proto * sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_msg_redirect_map: return &bpf_msg_redirect_map_proto; case BPF_FUNC_msg_redirect_hash: return &bpf_msg_redirect_hash_proto; case BPF_FUNC_msg_apply_bytes: return &bpf_msg_apply_bytes_proto; case BPF_FUNC_msg_cork_bytes: return &bpf_msg_cork_bytes_proto; case BPF_FUNC_msg_pull_data: return &bpf_msg_pull_data_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; default: return 
bpf_base_func_proto(func_id); } } static const struct bpf_func_proto * sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: return &sk_skb_pull_data_proto; case BPF_FUNC_skb_change_tail: return &sk_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &sk_skb_change_head_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_sk_redirect_map: return &bpf_sk_redirect_map_proto; case BPF_FUNC_sk_redirect_hash: return &bpf_sk_redirect_hash_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } } static const struct bpf_func_proto * lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; case BPF_FUNC_skb_pull_data: return &bpf_skb_pull_data_proto; case BPF_FUNC_csum_diff: return &bpf_csum_diff_proto; case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; case BPF_FUNC_get_hash_recalc: return &bpf_get_hash_recalc_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: return bpf_base_func_proto(func_id); } } static const struct bpf_func_proto * lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_lwt_push_encap: return &bpf_lwt_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_skb_get_tunnel_opt: return &bpf_skb_get_tunnel_opt_proto; case BPF_FUNC_skb_set_tunnel_opt: return bpf_get_skb_set_tunnel_proto(func_id); case BPF_FUNC_redirect: return &bpf_redirect_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_change_head: return &bpf_skb_change_head_proto; case BPF_FUNC_skb_store_bytes: return &bpf_skb_store_bytes_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: return &bpf_l4_csum_replace_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; default: return lwt_out_func_proto(func_id, prog); } } static const struct bpf_func_proto * lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) case BPF_FUNC_lwt_seg6_store_bytes: return &bpf_lwt_seg6_store_bytes_proto; case BPF_FUNC_lwt_seg6_action: return &bpf_lwt_seg6_action_proto; case BPF_FUNC_lwt_seg6_adjust_srh: return &bpf_lwt_seg6_adjust_srh_proto; #endif default: return lwt_out_func_proto(func_id, prog); } } static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct 
bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct __sk_buff)) return false; /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; switch (off) { case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): if (off + size > offsetofend(struct __sk_buff, cb[4])) return false; break; case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): if (size != size_default) return false; break; default: /* Only narrow read access allowed for now. */ if (type == BPF_WRITE) { if (size != size_default) return false; } else { bpf_ctx_record_field_size(info, size_default); if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } } return true; } static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } /* Attach type specific accesses */ static bool __sock_filter_check_attach_type(int off, enum bpf_access_type access_type, enum bpf_attach_type attach_type) { switch (off) { case offsetof(struct bpf_sock, bound_dev_if): case offsetof(struct bpf_sock, mark): case offsetof(struct bpf_sock, priority): switch (attach_type) { case BPF_CGROUP_INET_SOCK_CREATE: goto full_access; default: return false; } case bpf_ctx_range(struct bpf_sock, src_ip4): switch (attach_type) { case BPF_CGROUP_INET4_POST_BIND: goto read_only; default: return false; } case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): switch (attach_type) { case BPF_CGROUP_INET6_POST_BIND: goto read_only; default: return false; } case bpf_ctx_range(struct bpf_sock, src_port): switch (attach_type) { case BPF_CGROUP_INET4_POST_BIND: case BPF_CGROUP_INET6_POST_BIND: goto read_only; default: return false; } } read_only: return access_type == BPF_READ; full_access: return true; } static bool __sock_filter_check_size(int off, int size, struct 
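/* Illustrative sketch, not part of this file: per sk_filter_is_valid_access()
 * above, cb[0]..cb[4] is the only part of struct __sk_buff a socket
 * filter may write. Assumes bpf_helpers.h wrappers; values are examples
 * only. The return value of a socket filter is the number of bytes of
 * the packet to keep (0 drops it).
 *
 *	SEC("socket")
 *	int sock_filter_scratch(struct __sk_buff *skb)
 *	{
 *		skb->cb[0] = 0xcafe;	// scratch space, allowed
 *		// skb->mark = 1;	would be rejected by the verifier
 *		if (skb->cb[0] != 0xcafe)
 *			return 0;
 *		return skb->len;
 *	}
 */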
bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); switch (off) { case bpf_ctx_range(struct bpf_sock, src_ip4): case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); } return size == size_default; } static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct bpf_sock)) return false; if (off % size != 0) return false; if (!__sock_filter_check_attach_type(off, type, prog->expected_attach_type)) return false; if (!__sock_filter_check_size(off, size, info)) return false; return true; } static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog, int drop_verdict) { struct bpf_insn *insn = insn_buf; if (!direct_write) return 0; /* if (!skb->cloned) * goto start; * * (Fast-path, otherwise approximation that we might be * a clone, do the rest in helper.) */ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET()); *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); /* ret = bpf_skb_pull_data(skb, 0); */ *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_pull_data); /* if (!ret) * goto restore; * return TC_ACT_SHOT; */ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict); *insn++ = BPF_EXIT_INSN(); /* restore: */ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); /* start: */ *insn++ = prog->insnsi[0]; return insn - insn_buf; } static int bpf_gen_ld_abs(const struct bpf_insn *orig, struct bpf_insn *insn_buf) { bool indirect = BPF_MODE(orig->code) == BPF_IND; struct bpf_insn *insn = insn_buf; if (!indirect) { *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm); } else { *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg); if (orig->imm) *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm); } /* We're guaranteed here that CTX is in R6. 
*/ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); switch (BPF_SIZE(orig->code)) { case BPF_B: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache); break; case BPF_H: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache); break; case BPF_W: *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache); break; } *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2); *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); *insn++ = BPF_EXIT_INSN(); return insn - insn_buf; } static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT); } static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): case bpf_ctx_range(struct __sk_buff, tc_index): case bpf_ctx_range(struct __sk_buff, priority): case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_meta): info->reg_type = PTR_TO_PACKET_META; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) return false; if (off % size != 0) return false; if (size != sizeof(__u32)) return false; return true; } static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { if (bpf_prog_is_dev_bound(prog->aux)) { switch (off) { case offsetof(struct xdp_md, rx_queue_index): return __is_valid_xdp_access(off, size); } } return false; } switch (off) { case offsetof(struct xdp_md, data): info->reg_type = PTR_TO_PACKET; break; case offsetof(struct xdp_md, data_meta): info->reg_type = PTR_TO_PACKET_META; break; case offsetof(struct xdp_md, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return __is_valid_xdp_access(off, size); } void bpf_warn_invalid_xdp_action(u32 act) { const u32 act_max = XDP_REDIRECT; pr_warn_once("%s XDP return value %u, expect packet loss!\n", act > act_max ? "Illegal" : "Driver unsupported", act); } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); static bool sock_addr_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct bpf_sock_addr)) return false; if (off % size != 0) return false; /* Disallow access to IPv6 fields from IPv4 contex and vise * versa. 
*/ switch (off) { case bpf_ctx_range(struct bpf_sock_addr, user_ip4): switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: break; default: return false; } break; case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP6_RECVMSG: break; default: return false; } break; case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): switch (prog->expected_attach_type) { case BPF_CGROUP_UDP4_SENDMSG: break; default: return false; } break; case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): switch (prog->expected_attach_type) { case BPF_CGROUP_UDP6_SENDMSG: break; default: return false; } break; } switch (off) { case bpf_ctx_range(struct bpf_sock_addr, user_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): /* Only narrow read access allowed for now. */ if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); if (!bpf_ctx_narrow_access_ok(off, size, size_default)) return false; } else { if (size != size_default) return false; } break; case bpf_ctx_range(struct bpf_sock_addr, user_port): if (size != size_default) return false; break; default: if (type == BPF_READ) { if (size != size_default) return false; } else { return false; } } return true; } static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct bpf_sock_ops)) return false; /* The verifier guarantees that size > 0. 
*/ if (off % size != 0) return false; if (type == BPF_WRITE) { switch (off) { case offsetof(struct bpf_sock_ops, reply): case offsetof(struct bpf_sock_ops, sk_txhash): if (size != size_default) return false; break; default: return false; } } else { switch (off) { case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, bytes_acked): if (size != sizeof(__u64)) return false; break; default: if (size != size_default) return false; break; } } return true; } static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog) { return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP); } static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): return false; } if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_index): case bpf_ctx_range(struct __sk_buff, priority): break; default: return false; } } switch (off) { case bpf_ctx_range(struct __sk_buff, mark): return false; case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; } return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool sk_msg_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) return false; switch (off) { case offsetof(struct sk_msg_md, data): info->reg_type = PTR_TO_PACKET; if (size != sizeof(__u64)) return false; break; case offsetof(struct sk_msg_md, data_end): info->reg_type = PTR_TO_PACKET_END; if (size != sizeof(__u64)) return false; break; default: if (size != sizeof(__u32)) return false; } if (off < 0 || off >= sizeof(struct sk_msg_md)) return false; if (off % size != 0) return false; return true; } static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct __sk_buff, len): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, len, 4, target_size)); break; case offsetof(struct __sk_buff, protocol): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, protocol, 2, target_size)); break; case offsetof(struct __sk_buff, vlan_proto): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_proto, 2, target_size)); break; case offsetof(struct __sk_buff, priority): if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, priority, 4, target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, priority, 4, target_size)); break; case offsetof(struct __sk_buff, ingress_ifindex): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, skb_iif, 4, target_size)); break; case offsetof(struct __sk_buff, ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct net_device, ifindex, 4, target_size)); break; case offsetof(struct __sk_buff, 
hash): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, hash, 4, target_size)); break; case offsetof(struct __sk_buff, mark): if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, mark, 4, target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, mark, 4, target_size)); break; case offsetof(struct __sk_buff, pkt_type): *target_size = 1; *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, PKT_TYPE_OFFSET()); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX); #ifdef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5); #endif break; case offsetof(struct __sk_buff, queue_mapping): *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, queue_mapping, 2, target_size)); break; case offsetof(struct __sk_buff, vlan_present): case offsetof(struct __sk_buff, vlan_tci): BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, vlan_tci, 2, target_size)); if (si->off == offsetof(struct __sk_buff, vlan_tci)) { *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, ~VLAN_TAG_PRESENT); } else { *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); } break; case offsetof(struct __sk_buff, cb[0]) ... offsetofend(struct __sk_buff, cb[4]) - 1: BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); BUILD_BUG_ON((offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data)) % sizeof(__u64)); prog->cb_access = 1; off = si->off; off -= offsetof(struct __sk_buff, cb[0]); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, data); if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); else *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, tc_classid): BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2); off = si->off; off -= offsetof(struct __sk_buff, tc_classid); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, tc_classid); *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, off); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), si->dst_reg, si->src_reg, offsetof(struct sk_buff, data)); break; case offsetof(struct __sk_buff, data_meta): off = si->off; off -= offsetof(struct __sk_buff, data_meta); off += offsetof(struct sk_buff, cb); off += offsetof(struct bpf_skb_data_end, data_meta); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, data_end): off = si->off; off -= offsetof(struct __sk_buff, data_end); off += offsetof(struct sk_buff, cb); off += offsetof(struct bpf_skb_data_end, data_end); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; case offsetof(struct __sk_buff, tc_index): #ifdef CONFIG_NET_SCHED if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, tc_index, 2, target_size)); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, tc_index, 2, target_size)); #else *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); else *insn++ = 
BPF_MOV64_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, napi_id): #if defined(CONFIG_NET_RX_BUSY_POLL) *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, napi_id, 4, target_size)); *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #else *target_size = 4; *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, family): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_family, 2, target_size)); break; case offsetof(struct __sk_buff, remote_ip4): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_daddr, 4, target_size)); break; case offsetof(struct __sk_buff, local_ip4): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_rcv_saddr, 4, target_size)); break; case offsetof(struct __sk_buff, remote_ip6[0]) ... offsetof(struct __sk_buff, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct __sk_buff, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, local_ip6[0]) ... 
offsetof(struct __sk_buff, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct __sk_buff, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct __sk_buff, remote_port): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_dport, 2, target_size)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct __sk_buff, local_port): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, bpf_target_off(struct sock_common, skc_num, 2, target_size)); break; } return insn - insn_buf; } static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4); if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_bound_dev_if)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_bound_dev_if)); break; case offsetof(struct bpf_sock, mark): BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_mark) != 4); if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_mark)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_mark)); break; case offsetof(struct bpf_sock, priority): BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4); if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_priority)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, sk_priority)); break; case offsetof(struct bpf_sock, family): BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, offsetof(struct sock, sk_family)); break; case offsetof(struct bpf_sock, type): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); break; case offsetof(struct bpf_sock, protocol): *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); break; case offsetof(struct bpf_sock, src_ip4): *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_rcv_saddr, FIELD_SIZEOF(struct sock_common, skc_rcv_saddr), target_size)); break; case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], 
src_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) off = si->off; off -= offsetof(struct bpf_sock, src_ip6[0]); *insn++ = BPF_LDX_MEM( BPF_SIZE(si->code), si->dst_reg, si->src_reg, bpf_target_off( struct sock_common, skc_v6_rcv_saddr.s6_addr32[0], FIELD_SIZEOF(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]), target_size) + off); #else (void)off; *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct bpf_sock, src_port): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_num), si->dst_reg, si->src_reg, bpf_target_off(struct sock_common, skc_num, FIELD_SIZEOF(struct sock_common, skc_num), target_size)); break; } return insn - insn_buf; } static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct __sk_buff, ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, bpf_target_off(struct net_device, ifindex, 4, target_size)); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); } return insn - insn_buf; } static u32 xdp_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct xdp_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data)); break; case offsetof(struct xdp_md, data_meta): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_meta)); break; case offsetof(struct xdp_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_end)); break; case offsetof(struct xdp_md, ingress_ifindex): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, rxq)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), si->dst_reg, si->dst_reg, offsetof(struct xdp_rxq_info, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct net_device, ifindex)); break; case offsetof(struct xdp_md, rx_queue_index): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), si->dst_reg, si->src_reg, offsetof(struct xdp_buff, rxq)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct xdp_rxq_info, queue_index)); break; } return insn - insn_buf; } /* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of * context Structure, F is Field in context structure that contains a pointer * to Nested Structure of type NS that has the field NF. * * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make * sure that SIZE is not greater than actual size of S.F.NF. * * If offset OFF is provided, the load happens from that offset relative to * offset of NF. 
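 * For example, SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
 * struct sockaddr, uaddr, sa_family) emits two loads: the first fetches
 * the uaddr pointer out of the context, the second reads sa_family
 * through that pointer (this is how user_family is converted below).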
*/ #define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \ do { \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \ si->src_reg, offsetof(S, F)); \ *insn++ = BPF_LDX_MEM( \ SIZE, si->dst_reg, si->dst_reg, \ bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ target_size) \ + OFF); \ } while (0) #define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \ BPF_FIELD_SIZEOF(NS, NF), 0) /* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. * * It doesn't support SIZE argument though since narrow stores are not * supported for now. * * In addition it uses Temporary Field TF (member of struct S) as the 3rd * "register" since two registers available in convert_ctx_access are not * enough: we can't override neither SRC, since it contains value to store, nor * DST since it contains pointer to context that may be used by later * instructions. But we need a temporary place to save pointer to nested * structure whose field we want to store to. */ #define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF) \ do { \ int tmp_reg = BPF_REG_9; \ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ --tmp_reg; \ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ --tmp_reg; \ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \ offsetof(S, TF)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ si->dst_reg, offsetof(S, F)); \ *insn++ = BPF_STX_MEM( \ BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg, \ bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ target_size) \ + OFF); \ *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ offsetof(S, TF)); \ } while (0) #define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \ TF) \ do { \ if (type == BPF_WRITE) { \ SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, \ TF); \ } else { \ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ S, NS, F, NF, SIZE, OFF); \ } \ } while (0) #define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \ S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF) static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct bpf_sock_addr, user_family): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sockaddr, uaddr, sa_family); break; case offsetof(struct bpf_sock_addr, user_ip4): SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sockaddr_in, uaddr, sin_addr, BPF_SIZE(si->code), 0, tmp_reg); break; case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): off = si->off; off -= offsetof(struct bpf_sock_addr, user_ip6[0]); SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); break; case offsetof(struct bpf_sock_addr, user_port): /* To get port we need to know sa_family first and then treat * sockaddr as either sockaddr_in or sockaddr_in6. * Though we can simplify since port field has same offset and * size in both structures. * Here we check this invariant and use just one of the * structures if it's true. 
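 * The BUILD_BUG_ON()s right below turn that offset/size invariant into a
 * compile-time check.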
*/ BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) != offsetof(struct sockaddr_in6, sin6_port)); BUILD_BUG_ON(FIELD_SIZEOF(struct sockaddr_in, sin_port) != FIELD_SIZEOF(struct sockaddr_in6, sin6_port)); SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, sin6_port, tmp_reg); break; case offsetof(struct bpf_sock_addr, family): SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sock, sk, sk_family); break; case offsetof(struct bpf_sock_addr, type): SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sock, sk, __sk_flags_offset, BPF_W, 0); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); break; case offsetof(struct bpf_sock_addr, protocol): SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct sock, sk, __sk_flags_offset, BPF_W, 0); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); break; case offsetof(struct bpf_sock_addr, msg_src_ip4): /* Treat t_ctx as struct in_addr for msg_src_ip4. */ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct in_addr, t_ctx, s_addr, BPF_SIZE(si->code), 0, tmp_reg); break; case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): off = si->off; off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]); /* Treat t_ctx as struct in6_addr for msg_src_ip6. */ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( struct bpf_sock_addr_kern, struct in6_addr, t_ctx, s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg); break; } return insn - insn_buf; } static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct bpf_sock_ops, op) ... 
offsetof(struct bpf_sock_ops, replylong[3]): BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, op) != FIELD_SIZEOF(struct bpf_sock_ops_kern, op)); BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, reply) != FIELD_SIZEOF(struct bpf_sock_ops_kern, reply)); BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, replylong) != FIELD_SIZEOF(struct bpf_sock_ops_kern, replylong)); off = si->off; off -= offsetof(struct bpf_sock_ops, op); off += offsetof(struct bpf_sock_ops_kern, op); if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, off); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); break; case offsetof(struct bpf_sock_ops, family): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_family)); break; case offsetof(struct bpf_sock_ops, remote_ip4): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_daddr)); break; case offsetof(struct bpf_sock_ops, local_ip4): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_rcv_saddr)); break; case offsetof(struct bpf_sock_ops, remote_ip6[0]) ... offsetof(struct bpf_sock_ops, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct bpf_sock_ops, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct bpf_sock_ops, local_ip6[0]) ... 
offsetof(struct bpf_sock_ops, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct bpf_sock_ops, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct bpf_sock_ops, remote_port): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_dport)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct bpf_sock_ops, local_port): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; case offsetof(struct bpf_sock_ops, is_fullsock): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, is_fullsock), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, is_fullsock)); break; case offsetof(struct bpf_sock_ops, state): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_state)); break; case offsetof(struct bpf_sock_ops, rtt_min): BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != sizeof(struct minmax)); BUILD_BUG_ON(sizeof(struct minmax) < sizeof(struct minmax_sample)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct bpf_sock_ops_kern, sk), si->dst_reg, si->src_reg, offsetof(struct bpf_sock_ops_kern, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct tcp_sock, rtt_min) + FIELD_SIZEOF(struct minmax_sample, t)); break; /* Helper macro for adding read access to tcp_sock or sock fields. */ #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_fullsock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ si->dst_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ OBJ_FIELD), \ si->dst_reg, si->dst_reg, \ offsetof(OBJ, OBJ_FIELD)); \ } while (0) /* Helper macro for adding write access to tcp_sock or sock fields. * The macro is called with two registers, dst_reg which contains a pointer * to ctx (context) and src_reg which contains the value that should be * stored. However, we need an additional register since we cannot overwrite * dst_reg because it may be used later in the program. * Instead we "borrow" one of the other register. 
We first save its value * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore * it at the end of the macro. */ #define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ do { \ int reg = BPF_REG_9; \ BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ if (si->dst_reg == reg || si->src_reg == reg) \ reg--; \ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ is_fullsock), \ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ is_fullsock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, sk));\ *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ reg, si->src_reg, \ offsetof(OBJ, OBJ_FIELD)); \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ temp)); \ } while (0) #define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ do { \ if (TYPE == BPF_WRITE) \ SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ else \ SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ } while (0) case offsetof(struct bpf_sock_ops, snd_cwnd): SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, srtt_us): SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, snd_ssthresh): SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, rcv_nxt): SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, snd_nxt): SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, snd_una): SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, mss_cache): SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, ecn_flags): SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, rate_delivered): SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, rate_interval_us): SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, packets_out): SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, retrans_out): SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, total_retrans): SOCK_OPS_GET_FIELD(total_retrans, total_retrans, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, segs_in): SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, data_segs_in): SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, segs_out): SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, data_segs_out): SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, lost_out): SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock); break; case offsetof(struct 
bpf_sock_ops, sacked_out): SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, sk_txhash): SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, struct sock, type); break; case offsetof(struct bpf_sock_ops, bytes_received): SOCK_OPS_GET_FIELD(bytes_received, bytes_received, struct tcp_sock); break; case offsetof(struct bpf_sock_ops, bytes_acked): SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock); break; } return insn - insn_buf; } static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct __sk_buff, data_end): off = si->off; off -= offsetof(struct __sk_buff, data_end); off += offsetof(struct sk_buff, cb); off += offsetof(struct tcp_skb_cb, bpf.data_end); *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg, off); break; default: return bpf_convert_ctx_access(type, si, insn_buf, prog, target_size); } return insn - insn_buf; } static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; #if IS_ENABLED(CONFIG_IPV6) int off; #endif switch (si->off) { case offsetof(struct sk_msg_md, data): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data), si->dst_reg, si->src_reg, offsetof(struct sk_msg_buff, data)); break; case offsetof(struct sk_msg_md, data_end): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end), si->dst_reg, si->src_reg, offsetof(struct sk_msg_buff, data_end)); break; case offsetof(struct sk_msg_md, family): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_family)); break; case offsetof(struct sk_msg_md, remote_ip4): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_daddr)); break; case offsetof(struct sk_msg_md, local_ip4): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_rcv_saddr)); break; case offsetof(struct sk_msg_md, remote_ip6[0]) ... offsetof(struct sk_msg_md, remote_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_v6_daddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct sk_msg_md, remote_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_daddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct sk_msg_md, local_ip6[0]) ... 
offsetof(struct sk_msg_md, local_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) != 4); off = si->off; off -= offsetof(struct sk_msg_md, local_ip6[0]); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_v6_rcv_saddr.s6_addr32[0]) + off); #else *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); #endif break; case offsetof(struct sk_msg_md, remote_port): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_dport)); #ifndef __BIG_ENDIAN_BITFIELD *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); #endif break; case offsetof(struct sk_msg_md, local_port): BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( struct sk_msg_buff, sk), si->dst_reg, si->src_reg, offsetof(struct sk_msg_buff, sk)); *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; } return insn - insn_buf; } const struct bpf_verifier_ops sk_filter_verifier_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_ld_abs = bpf_gen_ld_abs, }; const struct bpf_prog_ops sk_filter_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops xdp_verifier_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, }; const struct bpf_prog_ops xdp_prog_ops = { .test_run = bpf_prog_test_run_xdp, }; const struct bpf_verifier_ops cg_skb_verifier_ops = { .get_func_proto = cg_skb_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops cg_skb_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_in_verifier_ops = { .get_func_proto = lwt_in_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops lwt_in_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_out_verifier_ops = { .get_func_proto = lwt_out_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct bpf_prog_ops lwt_out_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_xmit_verifier_ops = { .get_func_proto = lwt_xmit_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, }; const struct bpf_prog_ops lwt_xmit_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { .get_func_proto = lwt_seg6local_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; const struct 
bpf_prog_ops lwt_seg6local_prog_ops = { .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops cg_sock_verifier_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, .convert_ctx_access = sock_filter_convert_ctx_access, }; const struct bpf_prog_ops cg_sock_prog_ops = { }; const struct bpf_verifier_ops cg_sock_addr_verifier_ops = { .get_func_proto = sock_addr_func_proto, .is_valid_access = sock_addr_is_valid_access, .convert_ctx_access = sock_addr_convert_ctx_access, }; const struct bpf_prog_ops cg_sock_addr_prog_ops = { }; const struct bpf_verifier_ops sock_ops_verifier_ops = { .get_func_proto = sock_ops_func_proto, .is_valid_access = sock_ops_is_valid_access, .convert_ctx_access = sock_ops_convert_ctx_access, }; const struct bpf_prog_ops sock_ops_prog_ops = { }; const struct bpf_verifier_ops sk_skb_verifier_ops = { .get_func_proto = sk_skb_func_proto, .is_valid_access = sk_skb_is_valid_access, .convert_ctx_access = sk_skb_convert_ctx_access, .gen_prologue = sk_skb_prologue, }; const struct bpf_prog_ops sk_skb_prog_ops = { }; const struct bpf_verifier_ops sk_msg_verifier_ops = { .get_func_proto = sk_msg_func_proto, .is_valid_access = sk_msg_is_valid_access, .convert_ctx_access = sk_msg_convert_ctx_access, }; const struct bpf_prog_ops sk_msg_prog_ops = { }; int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; struct sk_filter *filter; if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; filter = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); if (filter) { RCU_INIT_POINTER(sk->sk_filter, NULL); sk_filter_uncharge(sk, filter); ret = 0; } return ret; } EXPORT_SYMBOL_GPL(sk_detach_filter); int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len) { struct sock_fprog_kern *fprog; struct sk_filter *filter; int ret = 0; lock_sock(sk); filter = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); if (!filter) goto out; /* We're copying the filter that has been originally attached, * so no conversion/decode needed anymore. eBPF programs that * have no original program cannot be dumped through this. */ ret = -EACCES; fprog = filter->prog->orig_prog; if (!fprog) goto out; ret = fprog->len; if (!len) /* User space only enquires number of filter blocks. */ goto out; ret = -EINVAL; if (len < fprog->len) goto out; ret = -EFAULT; if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog))) goto out; /* Instead of bytes, the API requests to return the number * of filter blocks. 
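 * That is the same unit as the "len" argument, so user space can size its
 * buffer by first calling with len == 0 to learn the block count.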
*/ ret = fprog->len; out: release_sock(sk); return ret; } #ifdef CONFIG_INET struct sk_reuseport_kern { struct sk_buff *skb; struct sock *sk; struct sock *selected_sk; void *data_end; u32 hash; u32 reuseport_id; bool bind_inany; }; static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern, struct sock_reuseport *reuse, struct sock *sk, struct sk_buff *skb, u32 hash) { reuse_kern->skb = skb; reuse_kern->sk = sk; reuse_kern->selected_sk = NULL; reuse_kern->data_end = skb->data + skb_headlen(skb); reuse_kern->hash = hash; reuse_kern->reuseport_id = reuse->reuseport_id; reuse_kern->bind_inany = reuse->bind_inany; } struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, u32 hash) { struct sk_reuseport_kern reuse_kern; enum sk_action action; bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash); action = BPF_PROG_RUN(prog, &reuse_kern); if (action == SK_PASS) return reuse_kern.selected_sk; else return ERR_PTR(-ECONNREFUSED); } BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, struct bpf_map *, map, void *, key, u32, flags) { struct sock_reuseport *reuse; struct sock *selected_sk; selected_sk = map->ops->map_lookup_elem(map, key); if (!selected_sk) return -ENOENT; reuse = rcu_dereference(selected_sk->sk_reuseport_cb); if (!reuse) /* selected_sk is unhashed (e.g. by close()) after the * above map_lookup_elem(). Treat selected_sk has already * been removed from the map. */ return -ENOENT; if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) { struct sock *sk; if (unlikely(!reuse_kern->reuseport_id)) /* There is a small race between adding the * sk to the map and setting the * reuse_kern->reuseport_id. * Treat it as the sk has not been added to * the bpf map yet. */ return -ENOENT; sk = reuse_kern->sk; if (sk->sk_protocol != selected_sk->sk_protocol) return -EPROTOTYPE; else if (sk->sk_family != selected_sk->sk_family) return -EAFNOSUPPORT; /* Catch all. Likely bound to a different sockaddr. 
*/ return -EBADFD; } reuse_kern->selected_sk = selected_sk; return 0; } static const struct bpf_func_proto sk_select_reuseport_proto = { .func = sk_select_reuseport, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(sk_reuseport_load_bytes, const struct sk_reuseport_kern *, reuse_kern, u32, offset, void *, to, u32, len) { return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len); } static const struct bpf_func_proto sk_reuseport_load_bytes_proto = { .func = sk_reuseport_load_bytes, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; BPF_CALL_5(sk_reuseport_load_bytes_relative, const struct sk_reuseport_kern *, reuse_kern, u32, offset, void *, to, u32, len, u32, start_header) { return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to, len, start_header); } static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = { .func = sk_reuseport_load_bytes_relative, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; static const struct bpf_func_proto * sk_reuseport_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_sk_select_reuseport: return &sk_select_reuseport_proto; case BPF_FUNC_skb_load_bytes: return &sk_reuseport_load_bytes_proto; case BPF_FUNC_skb_load_bytes_relative: return &sk_reuseport_load_bytes_relative_proto; default: return bpf_base_func_proto(func_id); } } static bool sk_reuseport_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const u32 size_default = sizeof(__u32); if (off < 0 || off >= sizeof(struct sk_reuseport_md) || off % size || type != BPF_READ) return false; switch (off) { case offsetof(struct sk_reuseport_md, data): info->reg_type = PTR_TO_PACKET; return size == sizeof(__u64); case offsetof(struct sk_reuseport_md, data_end): info->reg_type = PTR_TO_PACKET_END; return size == sizeof(__u64); case offsetof(struct sk_reuseport_md, hash): return size == size_default; /* Fields that allow narrowing */ case bpf_ctx_range(struct sk_reuseport_md, eth_protocol): if (size < FIELD_SIZEOF(struct sk_buff, protocol)) return false; /* fall through */ case bpf_ctx_range(struct sk_reuseport_md, ip_protocol): case bpf_ctx_range(struct sk_reuseport_md, bind_inany): case bpf_ctx_range(struct sk_reuseport_md, len): bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); default: return false; } } #define SK_REUSEPORT_LOAD_FIELD(F) ({ \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \ si->dst_reg, si->src_reg, \ bpf_target_off(struct sk_reuseport_kern, F, \ FIELD_SIZEOF(struct sk_reuseport_kern, F), \ target_size)); \ }) #define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \ SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \ struct sk_buff, \ skb, \ SKB_FIELD) #define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \ struct sock, \ sk, \ SK_FIELD, BPF_SIZE, EXTRA_OFF) static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog 
*prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct sk_reuseport_md, data): SK_REUSEPORT_LOAD_SKB_FIELD(data); break; case offsetof(struct sk_reuseport_md, len): SK_REUSEPORT_LOAD_SKB_FIELD(len); break; case offsetof(struct sk_reuseport_md, eth_protocol): SK_REUSEPORT_LOAD_SKB_FIELD(protocol); break; case offsetof(struct sk_reuseport_md, ip_protocol): BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE); SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset, BPF_W, 0); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); /* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian * aware. No further narrowing or masking is needed. */ *target_size = 1; break; case offsetof(struct sk_reuseport_md, data_end): SK_REUSEPORT_LOAD_FIELD(data_end); break; case offsetof(struct sk_reuseport_md, hash): SK_REUSEPORT_LOAD_FIELD(hash); break; case offsetof(struct sk_reuseport_md, bind_inany): SK_REUSEPORT_LOAD_FIELD(bind_inany); break; } return insn - insn_buf; } const struct bpf_verifier_ops sk_reuseport_verifier_ops = { .get_func_proto = sk_reuseport_func_proto, .is_valid_access = sk_reuseport_is_valid_access, .convert_ctx_access = sk_reuseport_convert_ctx_access, }; const struct bpf_prog_ops sk_reuseport_prog_ops = { }; #endif /* CONFIG_INET */
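/* Illustrative sketch (not part of this file): the convert_ctx_access
 * callbacks above are what let BPF programs be written against the stable
 * UAPI context types instead of kernel-internal structures.  An
 * SK_REUSEPORT program such as
 *
 *	SEC("sk_reuseport")
 *	int select_sock(struct sk_reuseport_md *reuse_md)
 *	{
 *		__u32 key = 0;
 *
 *		if (reuse_md->ip_protocol != IPPROTO_UDP)
 *			return SK_DROP;
 *		if (bpf_sk_select_reuseport(reuse_md, &socket_map, &key, 0))
 *			return SK_DROP;
 *		return SK_PASS;
 *	}
 *
 * never touches struct sk_reuseport_kern: the reuse_md->ip_protocol load is
 * rewritten by sk_reuseport_convert_ctx_access() above into a masked read
 * of sk->__sk_flags_offset at verification time.  ("select_sock" and
 * "socket_map" are made-up names for the sketch, not kernel symbols.)
 */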
/* * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ #ifndef _TLS_OFFLOAD_H #define _TLS_OFFLOAD_H #include <linux/types.h> #include <asm/byteorder.h> #include <linux/crypto.h> #include <linux/socket.h> #include <linux/tcp.h> #include <net/tcp.h> #include <net/strparser.h> #include <uapi/linux/tls.h> /* Maximum data size carried in a TLS record */ #define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14) #define TLS_HEADER_SIZE 5 #define TLS_NONCE_OFFSET TLS_HEADER_SIZE #define TLS_CRYPTO_INFO_READY(info) ((info)->cipher_type) #define TLS_RECORD_TYPE_DATA 0x17 #define TLS_AAD_SPACE_SIZE 13 #define TLS_DEVICE_NAME_MAX 32 /* * This structure defines the routines for Inline TLS driver. * The following routines are optional and filled with a * null pointer if not defined. * * @name: Its the name of registered Inline tls device * @dev_list: Inline tls device list * int (*feature)(struct tls_device *device); * Called to return Inline TLS driver capability * * int (*hash)(struct tls_device *device, struct sock *sk); * This function sets Inline driver for listen and program * device specific functioanlity as required * * void (*unhash)(struct tls_device *device, struct sock *sk); * This function cleans listen state set by Inline TLS driver */ struct tls_device { char name[TLS_DEVICE_NAME_MAX]; struct list_head dev_list; int (*feature)(struct tls_device *device); int (*hash)(struct tls_device *device, struct sock *sk); void (*unhash)(struct tls_device *device, struct sock *sk); }; enum { TLS_BASE, TLS_SW, #ifdef CONFIG_TLS_DEVICE TLS_HW, #endif TLS_HW_RECORD, TLS_NUM_CONFIG, }; struct tls_sw_context_tx { struct crypto_aead *aead_send; struct crypto_wait async_wait; char aad_space[TLS_AAD_SPACE_SIZE]; unsigned int sg_plaintext_size; int sg_plaintext_num_elem; struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS]; unsigned int sg_encrypted_size; int sg_encrypted_num_elem; struct scatterlist sg_encrypted_data[MAX_SKB_FRAGS]; /* AAD | sg_plaintext_data | sg_tag */ struct scatterlist sg_aead_in[2]; /* AAD | sg_encrypted_data (data contain overhead for hdr&iv&tag) */ struct scatterlist sg_aead_out[2]; }; struct tls_sw_context_rx { struct crypto_aead *aead_recv; struct crypto_wait async_wait; struct strparser strp; void (*saved_data_ready)(struct sock *sk); unsigned int (*sk_poll)(struct file *file, struct socket *sock, struct poll_table_struct *wait); struct sk_buff *recv_pkt; u8 control; bool decrypted; }; struct tls_record_info { struct list_head list; u32 end_seq; int len; int num_frags; skb_frag_t frags[MAX_SKB_FRAGS]; }; struct tls_offload_context_tx { struct crypto_aead *aead_send; spinlock_t lock; /* protects records list */ struct list_head records_list; struct tls_record_info *open_record; struct tls_record_info *retransmit_hint; u64 hint_record_sn; u64 unacked_record_sn; struct scatterlist sg_tx_data[MAX_SKB_FRAGS]; void (*sk_destruct)(struct sock *sk); u8 driver_state[]; /* The TLS layer reserves room for driver specific state * Currently the belief is that there is not enough * driver specific state to justify another layer of indirection */ #define TLS_DRIVER_STATE_SIZE (max_t(size_t, 8, sizeof(void *))) }; #define TLS_OFFLOAD_CONTEXT_SIZE_TX \ (ALIGN(sizeof(struct tls_offload_context_tx), sizeof(void *)) + \ TLS_DRIVER_STATE_SIZE) enum { TLS_PENDING_CLOSED_RECORD }; enum tls_context_flags { TLS_RX_SYNC_RUNNING = 0, /* tls_dev_del was called for the RX side, device state was released, * but tls_ctx->netdev might still be kept, because TX-side driver * resources might not be released yet. 
Used to prevent the second * tls_dev_del call in tls_device_down if it happens simultaneously. */ TLS_RX_DEV_CLOSED = 2, }; struct cipher_context { u16 prepend_size; u16 tag_size; u16 overhead_size; u16 iv_size; char *iv; u16 rec_seq_size; char *rec_seq; }; union tls_crypto_context { struct tls_crypto_info info; struct tls12_crypto_info_aes_gcm_128 aes_gcm_128; }; struct tls_context { union tls_crypto_context crypto_send; union tls_crypto_context crypto_recv; struct list_head list; struct net_device *netdev; refcount_t refcount; void *priv_ctx_tx; void *priv_ctx_rx; u8 tx_conf:3; u8 rx_conf:3; struct cipher_context tx; struct cipher_context rx; struct scatterlist *partially_sent_record; u16 partially_sent_offset; unsigned long flags; bool in_tcp_sendpages; u16 pending_open_record_frags; int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); void (*sk_destruct)(struct sock *sk); void (*sk_proto_close)(struct sock *sk, long timeout); int (*setsockopt)(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int (*hash)(struct sock *sk); void (*unhash)(struct sock *sk); }; struct tls_offload_context_rx { /* sw must be the first member of tls_offload_context_rx */ struct tls_sw_context_rx sw; atomic64_t resync_req; u8 driver_state[]; /* The TLS layer reserves room for driver specific state * Currently the belief is that there is not enough * driver specific state to justify another layer of indirection */ }; #define TLS_OFFLOAD_CONTEXT_SIZE_RX \ (ALIGN(sizeof(struct tls_offload_context_rx), sizeof(void *)) + \ TLS_DRIVER_STATE_SIZE) void tls_ctx_free(struct tls_context *ctx); int wait_on_pending_writer(struct sock *sk, long *timeo); int tls_sk_query(struct sock *sk, int optname, char __user *optval, int __user *optlen); int tls_sk_attach(struct sock *sk, int optname, char __user *optval, unsigned int optlen); int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_sw_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); void tls_sw_close(struct sock *sk, long timeout); void tls_sw_free_resources_tx(struct sock *sk); void tls_sw_free_resources_rx(struct sock *sk); void tls_sw_release_resources_rx(struct sock *sk); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); unsigned int tls_sw_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); int tls_set_device_offload(struct sock *sk, struct tls_context *ctx); int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_device_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); void tls_device_sk_destruct(struct sock *sk); void tls_device_init(void); void tls_device_cleanup(void); struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn); static inline bool tls_record_is_start_marker(struct tls_record_info *rec) { return rec->len == 0; } static inline u32 tls_record_start_seq(struct tls_record_info *rec) { return rec->end_seq - rec->len; } void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); int tls_push_sg(struct sock *sk, struct tls_context *ctx, 
struct scatterlist *sg, u16 first_offset, int flags); int tls_push_pending_closed_record(struct sock *sk, struct tls_context *ctx, int flags, long *timeo); static inline bool tls_is_pending_closed_record(struct tls_context *ctx) { return test_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags); } static inline int tls_complete_pending_work(struct sock *sk, struct tls_context *ctx, int flags, long *timeo) { int rc = 0; if (unlikely(sk->sk_write_pending)) rc = wait_on_pending_writer(sk, timeo); if (!rc && tls_is_pending_closed_record(ctx)) rc = tls_push_pending_closed_record(sk, ctx, flags, timeo); return rc; } static inline bool tls_is_partially_sent_record(struct tls_context *ctx) { return !!ctx->partially_sent_record; } static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) { return tls_ctx->pending_open_record_frags; } struct sk_buff * tls_validate_xmit_skb(struct sock *sk, struct net_device *dev, struct sk_buff *skb); static inline bool tls_is_sk_tx_device_offloaded(struct sock *sk) { #ifdef CONFIG_SOCK_VALIDATE_XMIT return sk_fullsock(sk) && (smp_load_acquire(&sk->sk_validate_xmit_skb) == &tls_validate_xmit_skb); #else return false; #endif } static inline void tls_err_abort(struct sock *sk, int err) { sk->sk_err = err; sk->sk_error_report(sk); } static inline bool tls_bigint_increment(unsigned char *seq, int len) { int i; for (i = len - 1; i >= 0; i--) { ++seq[i]; if (seq[i] != 0) break; } return (i == -1); } static inline void tls_advance_record_sn(struct sock *sk, struct cipher_context *ctx) { if (tls_bigint_increment(ctx->rec_seq, ctx->rec_seq_size)) tls_err_abort(sk, EBADMSG); tls_bigint_increment(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, ctx->iv_size); } static inline void tls_fill_prepend(struct tls_context *ctx, char *buf, size_t plaintext_len, unsigned char record_type) { size_t pkt_len, iv_size = ctx->tx.iv_size; pkt_len = plaintext_len + iv_size + ctx->tx.tag_size; /* we cover nonce explicit here as well, so buf should be of * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE */ buf[0] = record_type; buf[1] = TLS_VERSION_MINOR(ctx->crypto_send.info.version); buf[2] = TLS_VERSION_MAJOR(ctx->crypto_send.info.version); /* we can use IV for nonce explicit according to spec */ buf[3] = pkt_len >> 8; buf[4] = pkt_len & 0xFF; memcpy(buf + TLS_NONCE_OFFSET, ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size); } static inline void tls_make_aad(char *buf, size_t size, char *record_sequence, int record_sequence_size, unsigned char record_type) { memcpy(buf, record_sequence, record_sequence_size); buf[8] = record_type; buf[9] = TLS_1_2_VERSION_MAJOR; buf[10] = TLS_1_2_VERSION_MINOR; buf[11] = size >> 8; buf[12] = size & 0xFF; } static inline struct tls_context *tls_get_ctx(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); return icsk->icsk_ulp_data; } static inline struct tls_sw_context_rx *tls_sw_ctx_rx( const struct tls_context *tls_ctx) { return (struct tls_sw_context_rx *)tls_ctx->priv_ctx_rx; } static inline struct tls_sw_context_tx *tls_sw_ctx_tx( const struct tls_context *tls_ctx) { return (struct tls_sw_context_tx *)tls_ctx->priv_ctx_tx; } static inline struct tls_offload_context_tx * tls_offload_ctx_tx(const struct tls_context *tls_ctx) { return (struct tls_offload_context_tx *)tls_ctx->priv_ctx_tx; } static inline struct tls_offload_context_rx * tls_offload_ctx_rx(const struct tls_context *tls_ctx) { return (struct tls_offload_context_rx *)tls_ctx->priv_ctx_rx; } /* The TLS context is valid until sk_destruct is called */ 
static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); atomic64_set(&rx_ctx->resync_req, ((((uint64_t)seq) << 32) | 1)); } int tls_proccess_cmsg(struct sock *sk, struct msghdr *msg, unsigned char *record_type); void tls_register_device(struct tls_device *device); void tls_unregister_device(struct tls_device *device); int tls_device_decrypted(struct sock *sk, struct sk_buff *skb); int decrypt_skb(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgout); struct sk_buff *tls_validate_xmit_skb(struct sock *sk, struct net_device *dev, struct sk_buff *skb); int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info); int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx); void tls_device_offload_cleanup_rx(struct sock *sk); void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn); #endif /* _TLS_OFFLOAD_H */ |
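/* A minimal, self-contained sketch of the byte layout that tls_fill_prepend()
 * and tls_make_aad() above produce for TLS 1.2 AES-GCM-128. The function and
 * parameter names here are illustrative only; the constants (record type
 * 0x17, version bytes 0x03 0x03, 8-byte explicit nonce, 16-byte tag) follow
 * the definitions in this header.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void example_build_hdr_and_aad(uint8_t hdr[13], uint8_t aad[13],
				      const uint8_t rec_seq[8],
				      const uint8_t explicit_nonce[8],
				      size_t plaintext_len)
{
	/* record length on the wire = explicit nonce + ciphertext + tag */
	size_t pkt_len = plaintext_len + 8 + 16;

	/* 5-byte record header (TLS_HEADER_SIZE), nonce at TLS_NONCE_OFFSET */
	hdr[0] = 0x17;			/* TLS_RECORD_TYPE_DATA */
	hdr[1] = 0x03;			/* TLS 1.2 version bytes */
	hdr[2] = 0x03;
	hdr[3] = pkt_len >> 8;		/* 16-bit big-endian length */
	hdr[4] = pkt_len & 0xFF;
	memcpy(hdr + 5, explicit_nonce, 8);

	/* 13-byte AAD (TLS_AAD_SPACE_SIZE): seq | type | version | length */
	memcpy(aad, rec_seq, 8);
	aad[8]  = 0x17;
	aad[9]  = 0x03;
	aad[10] = 0x03;
	aad[11] = plaintext_len >> 8;
	aad[12] = plaintext_len & 0xFF;
}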
976 626 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM signal #if !defined(_TRACE_SIGNAL_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SIGNAL_H #include <linux/signal.h> #include <linux/sched.h> #include <linux/tracepoint.h> #define TP_STORE_SIGINFO(__entry, info) \ do { \ if (info == SEND_SIG_NOINFO || \ info == SEND_SIG_FORCED) { \ __entry->errno = 0; \ __entry->code = SI_USER; \ } else if (info == SEND_SIG_PRIV) { \ __entry->errno = 0; \ __entry->code = SI_KERNEL; \ } else { \ __entry->errno = info->si_errno; \ __entry->code = info->si_code; \ } \ } while (0) #ifndef TRACE_HEADER_MULTI_READ enum { TRACE_SIGNAL_DELIVERED, TRACE_SIGNAL_IGNORED, TRACE_SIGNAL_ALREADY_PENDING, TRACE_SIGNAL_OVERFLOW_FAIL, TRACE_SIGNAL_LOSE_INFO, }; #endif /** * signal_generate - called when a signal is generated * @sig: signal number * @info: pointer to struct siginfo * @task: pointer to struct task_struct * @group: shared or private * @result: TRACE_SIGNAL_* * * Current process sends a 'sig' signal to 'task' process with * 'info' siginfo. If 'info' is SEND_SIG_NOINFO or SEND_SIG_PRIV, * 'info' is not a pointer and you can't access its field. Instead, * SEND_SIG_NOINFO means that si_code is SI_USER, and SEND_SIG_PRIV * means that si_code is SI_KERNEL. */ TRACE_EVENT(signal_generate, TP_PROTO(int sig, struct siginfo *info, struct task_struct *task, int group, int result), TP_ARGS(sig, info, task, group, result), TP_STRUCT__entry( __field( int, sig ) __field( int, errno ) __field( int, code ) __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, group ) __field( int, result ) ), TP_fast_assign( __entry->sig = sig; TP_STORE_SIGINFO(__entry, info); memcpy(__entry->comm, task->comm, TASK_COMM_LEN); __entry->pid = task->pid; __entry->group = group; __entry->result = result; ), TP_printk("sig=%d errno=%d code=%d comm=%s pid=%d grp=%d res=%d", __entry->sig, __entry->errno, __entry->code, __entry->comm, __entry->pid, __entry->group, __entry->result) ); /** * signal_deliver - called when a signal is delivered * @sig: signal number * @info: pointer to struct siginfo * @ka: pointer to struct k_sigaction * * A 'sig' signal is delivered to current process with 'info' siginfo, * and it will be handled by 'ka'. ka->sa.sa_handler can be SIG_IGN or * SIG_DFL. * Note that some signals reported by signal_generate tracepoint can be * lost, ignored or modified (by debugger) before hitting this tracepoint. * This means, this can show which signals are actually delivered, but * matching generated signals and delivered signals may not be correct. 
*/ TRACE_EVENT(signal_deliver, TP_PROTO(int sig, struct siginfo *info, struct k_sigaction *ka), TP_ARGS(sig, info, ka), TP_STRUCT__entry( __field( int, sig ) __field( int, errno ) __field( int, code ) __field( unsigned long, sa_handler ) __field( unsigned long, sa_flags ) ), TP_fast_assign( __entry->sig = sig; TP_STORE_SIGINFO(__entry, info); __entry->sa_handler = (unsigned long)ka->sa.sa_handler; __entry->sa_flags = ka->sa.sa_flags; ), TP_printk("sig=%d errno=%d code=%d sa_handler=%lx sa_flags=%lx", __entry->sig, __entry->errno, __entry->code, __entry->sa_handler, __entry->sa_flags) ); #endif /* _TRACE_SIGNAL_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
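/* The TRACE_EVENT() definitions above expand into trace_signal_generate()
 * and trace_signal_deliver() functions that kernel code can call. A minimal
 * sketch of such a call site (illustrative only; the real call sites live in
 * kernel/signal.c and pass live siginfo/task state):
 */
#include <trace/events/signal.h>

static void example_fire_signal_generate(int sig, struct siginfo *info,
					 struct task_struct *task,
					 int group, int result)
{
	/* compiles down to a patched-out branch unless the event is enabled */
	trace_signal_generate(sig, info, task, group, result);
}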
88 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 | /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * ebtables * * Authors: * Bart De Schuymer <bdschuym@pandora.be> * * ebtables.c,v 2.0, April, 2002 * * This code is strongly inspired by the iptables code which is * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling */ #ifndef _UAPI__LINUX_BRIDGE_EFF_H #define _UAPI__LINUX_BRIDGE_EFF_H #include <linux/types.h> #include <linux/if.h> #include <linux/netfilter_bridge.h> #define EBT_TABLE_MAXNAMELEN 32 #define EBT_CHAIN_MAXNAMELEN EBT_TABLE_MAXNAMELEN #define EBT_FUNCTION_MAXNAMELEN EBT_TABLE_MAXNAMELEN #define EBT_EXTENSION_MAXNAMELEN 31 /* verdicts >0 are "branches" */ #define EBT_ACCEPT -1 #define EBT_DROP -2 #define EBT_CONTINUE -3 #define EBT_RETURN -4 #define NUM_STANDARD_TARGETS 4 /* ebtables target modules store the verdict inside an int. We can * reclaim a part of this int for backwards compatible extensions. * The 4 lsb are more than enough to store the verdict. */ #define EBT_VERDICT_BITS 0x0000000F struct xt_match; struct xt_target; struct ebt_counter { __u64 pcnt; __u64 bcnt; }; struct ebt_replace { char name[EBT_TABLE_MAXNAMELEN]; unsigned int valid_hooks; /* nr of rules in the table */ unsigned int nentries; /* total size of the entries */ unsigned int entries_size; /* start of the chains */ struct ebt_entries __user *hook_entry[NF_BR_NUMHOOKS]; /* nr of counters userspace expects back */ unsigned int num_counters; /* where the kernel will put the old counters */ struct ebt_counter __user *counters; char __user *entries; }; struct ebt_replace_kernel { char name[EBT_TABLE_MAXNAMELEN]; unsigned int valid_hooks; /* nr of rules in the table */ unsigned int nentries; /* total size of the entries */ unsigned int entries_size; /* start of the chains */ struct ebt_entries *hook_entry[NF_BR_NUMHOOKS]; /* nr of counters userspace expects back */ unsigned int num_counters; /* where the kernel will put the old counters */ struct ebt_counter *counters; char *entries; }; struct ebt_entries { /* this field is always set to zero * See EBT_ENTRY_OR_ENTRIES. * Must be same size as ebt_entry.bitmask */ unsigned int distinguisher; /* the chain name */ char name[EBT_CHAIN_MAXNAMELEN]; /* counter offset for this chain */ unsigned int counter_offset; /* one standard (accept, drop, return) per hook */ int policy; /* nr. 
of entries */ unsigned int nentries; /* entry list */ char data[0] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); }; /* used for the bitmask of struct ebt_entry */ /* This is a hack to make a difference between an ebt_entry struct and an * ebt_entries struct when traversing the entries from start to end. * Using this simplifies the code a lot, while still being able to use * ebt_entries. * Contrary, iptables doesn't use something like ebt_entries and therefore uses * different techniques for naming the policy and such. So, iptables doesn't * need a hack like this. */ #define EBT_ENTRY_OR_ENTRIES 0x01 /* these are the normal masks */ #define EBT_NOPROTO 0x02 #define EBT_802_3 0x04 #define EBT_SOURCEMAC 0x08 #define EBT_DESTMAC 0x10 #define EBT_F_MASK (EBT_NOPROTO | EBT_802_3 | EBT_SOURCEMAC | EBT_DESTMAC \ | EBT_ENTRY_OR_ENTRIES) #define EBT_IPROTO 0x01 #define EBT_IIN 0x02 #define EBT_IOUT 0x04 #define EBT_ISOURCE 0x8 #define EBT_IDEST 0x10 #define EBT_ILOGICALIN 0x20 #define EBT_ILOGICALOUT 0x40 #define EBT_INV_MASK (EBT_IPROTO | EBT_IIN | EBT_IOUT | EBT_ILOGICALIN \ | EBT_ILOGICALOUT | EBT_ISOURCE | EBT_IDEST) struct ebt_entry_match { union { struct { char name[EBT_EXTENSION_MAXNAMELEN]; uint8_t revision; }; struct xt_match *match; } u; /* size of data */ unsigned int match_size; unsigned char data[0] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); }; struct ebt_entry_watcher { union { struct { char name[EBT_EXTENSION_MAXNAMELEN]; uint8_t revision; }; struct xt_target *watcher; } u; /* size of data */ unsigned int watcher_size; unsigned char data[0] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); }; struct ebt_entry_target { union { struct { char name[EBT_EXTENSION_MAXNAMELEN]; uint8_t revision; }; struct xt_target *target; } u; /* size of data */ unsigned int target_size; unsigned char data[0] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); }; #define EBT_STANDARD_TARGET "standard" struct ebt_standard_target { struct ebt_entry_target target; int verdict; }; /* one entry */ struct ebt_entry { /* this needs to be the first field */ unsigned int bitmask; unsigned int invflags; __be16 ethproto; /* the physical in-dev */ char in[IFNAMSIZ]; /* the logical in-dev */ char logical_in[IFNAMSIZ]; /* the physical out-dev */ char out[IFNAMSIZ]; /* the logical out-dev */ char logical_out[IFNAMSIZ]; unsigned char sourcemac[ETH_ALEN]; unsigned char sourcemsk[ETH_ALEN]; unsigned char destmac[ETH_ALEN]; unsigned char destmsk[ETH_ALEN]; /* sizeof ebt_entry + matches */ unsigned int watchers_offset; /* sizeof ebt_entry + matches + watchers */ unsigned int target_offset; /* sizeof ebt_entry + matches + watchers + target */ unsigned int next_offset; unsigned char elems[0] __attribute__ ((aligned (__alignof__(struct ebt_replace)))); }; static __inline__ struct ebt_entry_target * ebt_get_target(struct ebt_entry *e) { return (void *)e + e->target_offset; } /* {g,s}etsockopt numbers */ #define EBT_BASE_CTL 128 #define EBT_SO_SET_ENTRIES (EBT_BASE_CTL) #define EBT_SO_SET_COUNTERS (EBT_SO_SET_ENTRIES+1) #define EBT_SO_SET_MAX (EBT_SO_SET_COUNTERS+1) #define EBT_SO_GET_INFO (EBT_BASE_CTL) #define EBT_SO_GET_ENTRIES (EBT_SO_GET_INFO+1) #define EBT_SO_GET_INIT_INFO (EBT_SO_GET_ENTRIES+1) #define EBT_SO_GET_INIT_ENTRIES (EBT_SO_GET_INIT_INFO+1) #define EBT_SO_GET_MAX (EBT_SO_GET_INIT_ENTRIES+1) /* blatently stolen from ip_tables.h * fn returns 0 to continue iteration */ #define EBT_MATCH_ITERATE(e, fn, args...) 
\ ({ \ unsigned int __i; \ int __ret = 0; \ struct ebt_entry_match *__match; \ \ for (__i = sizeof(struct ebt_entry); \ __i < (e)->watchers_offset; \ __i += __match->match_size + \ sizeof(struct ebt_entry_match)) { \ __match = (void *)(e) + __i; \ \ __ret = fn(__match , ## args); \ if (__ret != 0) \ break; \ } \ if (__ret == 0) { \ if (__i != (e)->watchers_offset) \ __ret = -EINVAL; \ } \ __ret; \ }) #define EBT_WATCHER_ITERATE(e, fn, args...) \ ({ \ unsigned int __i; \ int __ret = 0; \ struct ebt_entry_watcher *__watcher; \ \ for (__i = e->watchers_offset; \ __i < (e)->target_offset; \ __i += __watcher->watcher_size + \ sizeof(struct ebt_entry_watcher)) { \ __watcher = (void *)(e) + __i; \ \ __ret = fn(__watcher , ## args); \ if (__ret != 0) \ break; \ } \ if (__ret == 0) { \ if (__i != (e)->target_offset) \ __ret = -EINVAL; \ } \ __ret; \ }) #define EBT_ENTRY_ITERATE(entries, size, fn, args...) \ ({ \ unsigned int __i; \ int __ret = 0; \ struct ebt_entry *__entry; \ \ for (__i = 0; __i < (size);) { \ __entry = (void *)(entries) + __i; \ __ret = fn(__entry , ## args); \ if (__ret != 0) \ break; \ if (__entry->bitmask != 0) \ __i += __entry->next_offset; \ else \ __i += sizeof(struct ebt_entries); \ } \ if (__ret == 0) { \ if (__i != (size)) \ __ret = -EINVAL; \ } \ __ret; \ }) #endif /* _UAPI__LINUX_BRIDGE_EFF_H */ |
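/* A plain-function sketch of the traversal that EBT_ENTRY_ITERATE() above
 * performs: walk a blob in which struct ebt_entries chain headers
 * (bitmask == 0, see EBT_ENTRY_OR_ENTRIES) and struct ebt_entry rules are
 * laid out back to back. The callback type and function name are
 * illustrative; EINVAL is assumed to be available, as in the macros above.
 */
typedef int (*example_entry_cb)(struct ebt_entry *e, void *arg);

static int example_walk_entries(void *entries, unsigned int size,
				example_entry_cb fn, void *arg)
{
	unsigned int i = 0;

	while (i < size) {
		struct ebt_entry *e = (struct ebt_entry *)((char *)entries + i);
		int ret = fn(e, arg);

		if (ret != 0)
			return ret;

		/* bitmask == 0 marks a chain header, not a rule */
		if (e->bitmask != 0)
			i += e->next_offset;
		else
			i += sizeof(struct ebt_entries);
	}

	/* a well-formed blob ends exactly at 'size' */
	return i == size ? 0 : -EINVAL;
}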
41 40 39 38 37 37 28 41 28 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 | /* String matching match for iptables * * (C) 2005 Pablo Neira Ayuso <pablo@eurodev.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #include <linux/gfp.h> #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_string.h> #include <linux/textsearch.h> MODULE_AUTHOR("Pablo Neira Ayuso <pablo@eurodev.net>"); MODULE_DESCRIPTION("Xtables: string-based matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_string"); MODULE_ALIAS("ip6t_string"); MODULE_ALIAS("ebt_string"); static bool string_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_string_info *conf = par->matchinfo; bool invert; invert = conf->u.v1.flags & XT_STRING_FLAG_INVERT; return (skb_find_text((struct sk_buff *)skb, conf->from_offset, conf->to_offset, conf->config) != UINT_MAX) ^ invert; } #define STRING_TEXT_PRIV(m) ((struct xt_string_info *)(m)) static int string_mt_check(const struct xt_mtchk_param *par) { struct xt_string_info *conf = par->matchinfo; struct ts_config *ts_conf; int flags = TS_AUTOLOAD; /* Damn, can't handle this case properly with iptables... */ if (conf->from_offset > conf->to_offset) return -EINVAL; if (conf->algo[XT_STRING_MAX_ALGO_NAME_SIZE - 1] != '\0') return -EINVAL; if (conf->patlen > XT_STRING_MAX_PATTERN_SIZE) return -EINVAL; if (conf->u.v1.flags & ~(XT_STRING_FLAG_IGNORECASE | XT_STRING_FLAG_INVERT)) return -EINVAL; if (conf->u.v1.flags & XT_STRING_FLAG_IGNORECASE) flags |= TS_IGNORECASE; ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen, GFP_KERNEL, flags); if (IS_ERR(ts_conf)) return PTR_ERR(ts_conf); conf->config = ts_conf; return 0; } static void string_mt_destroy(const struct xt_mtdtor_param *par) { textsearch_destroy(STRING_TEXT_PRIV(par->matchinfo)->config); } static struct xt_match xt_string_mt_reg __read_mostly = { .name = "string", .revision = 1, .family = NFPROTO_UNSPEC, .checkentry = string_mt_check, .match = string_mt, .destroy = string_mt_destroy, .matchsize = sizeof(struct xt_string_info), .usersize = offsetof(struct xt_string_info, config), .me = THIS_MODULE, }; static int __init string_mt_init(void) { return xt_register_match(&xt_string_mt_reg); } static void __exit string_mt_exit(void) { xt_unregister_match(&xt_string_mt_reg); } module_init(string_mt_init); module_exit(string_mt_exit); |
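/* string_mt_check() above prepares a kernel textsearch configuration and
 * string_mt() then probes the skb with it via skb_find_text(). A minimal
 * sketch of the same textsearch API on a flat buffer (function name
 * illustrative, algorithm choice and lifetime management left to the caller):
 */
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/textsearch.h>
#include <linux/types.h>

static bool example_buf_contains(const char *algo, const void *pattern,
				 unsigned int patlen,
				 const void *buf, unsigned int buflen)
{
	struct ts_config *conf;
	struct ts_state state;
	unsigned int pos;

	conf = textsearch_prepare(algo, pattern, patlen, GFP_KERNEL, TS_AUTOLOAD);
	if (IS_ERR(conf))
		return false;

	/* UINT_MAX means "not found", the same convention skb_find_text() uses */
	pos = textsearch_find_continuous(conf, &state, buf, buflen);
	textsearch_destroy(conf);

	return pos != UINT_MAX;
}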
560 9 6 525 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 | /* * ip_vs_est.c: simple rate estimator for IPVS * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> * Network name space (netns) aware. * Global data moved to netns i.e struct netns_ipvs * Affected data: est_list and est_lock. * estimation_timer() runs with timer per netns. * get_stats()) do the per cpu summing. */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/types.h> #include <linux/interrupt.h> #include <linux/sysctl.h> #include <linux/list.h> #include <net/ip_vs.h> /* This code is to estimate rate in a shorter interval (such as 8 seconds) for virtual services and real servers. For measure rate in a long interval, it is easy to implement a user level daemon which periodically reads those statistical counters and measure rate. Currently, the measurement is activated by slow timer handler. Hope this measurement will not introduce too much load. We measure rate during the last 8 seconds every 2 seconds: avgrate = avgrate*(1-W) + rate*W where W = 2^(-2) NOTES. * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10. * Netlink users can see 64-bit values but sockopt users are restricted to 32-bit values for conns, packets, bps, cps and pps. 
* A lot of code is taken from net/core/gen_estimator.c */ /* * Make a summary from each cpu */ static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum, struct ip_vs_cpu_stats __percpu *stats) { int i; bool add = false; for_each_possible_cpu(i) { struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); unsigned int start; u64 conns, inpkts, outpkts, inbytes, outbytes; if (add) { do { start = u64_stats_fetch_begin(&s->syncp); conns = s->cnt.conns; inpkts = s->cnt.inpkts; outpkts = s->cnt.outpkts; inbytes = s->cnt.inbytes; outbytes = s->cnt.outbytes; } while (u64_stats_fetch_retry(&s->syncp, start)); sum->conns += conns; sum->inpkts += inpkts; sum->outpkts += outpkts; sum->inbytes += inbytes; sum->outbytes += outbytes; } else { add = true; do { start = u64_stats_fetch_begin(&s->syncp); sum->conns = s->cnt.conns; sum->inpkts = s->cnt.inpkts; sum->outpkts = s->cnt.outpkts; sum->inbytes = s->cnt.inbytes; sum->outbytes = s->cnt.outbytes; } while (u64_stats_fetch_retry(&s->syncp, start)); } } } static void estimation_timer(struct timer_list *t) { struct ip_vs_estimator *e; struct ip_vs_stats *s; u64 rate; struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer); spin_lock(&ipvs->est_lock); list_for_each_entry(e, &ipvs->est_list, list) { s = container_of(e, struct ip_vs_stats, est); spin_lock(&s->lock); ip_vs_read_cpu_stats(&s->kstats, s->cpustats); /* scaled by 2^10, but divided 2 seconds */ rate = (s->kstats.conns - e->last_conns) << 9; e->last_conns = s->kstats.conns; e->cps += ((s64)rate - (s64)e->cps) >> 2; rate = (s->kstats.inpkts - e->last_inpkts) << 9; e->last_inpkts = s->kstats.inpkts; e->inpps += ((s64)rate - (s64)e->inpps) >> 2; rate = (s->kstats.outpkts - e->last_outpkts) << 9; e->last_outpkts = s->kstats.outpkts; e->outpps += ((s64)rate - (s64)e->outpps) >> 2; /* scaled by 2^5, but divided 2 seconds */ rate = (s->kstats.inbytes - e->last_inbytes) << 4; e->last_inbytes = s->kstats.inbytes; e->inbps += ((s64)rate - (s64)e->inbps) >> 2; rate = (s->kstats.outbytes - e->last_outbytes) << 4; e->last_outbytes = s->kstats.outbytes; e->outbps += ((s64)rate - (s64)e->outbps) >> 2; spin_unlock(&s->lock); } spin_unlock(&ipvs->est_lock); mod_timer(&ipvs->est_timer, jiffies + 2*HZ); } void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; INIT_LIST_HEAD(&est->list); spin_lock_bh(&ipvs->est_lock); list_add(&est->list, &ipvs->est_list); spin_unlock_bh(&ipvs->est_lock); } void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; spin_lock_bh(&ipvs->est_lock); list_del(&est->list); spin_unlock_bh(&ipvs->est_lock); } void ip_vs_zero_estimator(struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; struct ip_vs_kstats *k = &stats->kstats; /* reset counters, caller must hold the stats->lock lock */ est->last_inbytes = k->inbytes; est->last_outbytes = k->outbytes; est->last_conns = k->conns; est->last_inpkts = k->inpkts; est->last_outpkts = k->outpkts; est->cps = 0; est->inpps = 0; est->outpps = 0; est->inbps = 0; est->outbps = 0; } /* Get decoded rates */ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats) { struct ip_vs_estimator *e = &stats->est; dst->cps = (e->cps + 0x1FF) >> 10; dst->inpps = (e->inpps + 0x1FF) >> 10; dst->outpps = (e->outpps + 0x1FF) >> 10; dst->inbps = (e->inbps + 0xF) >> 5; dst->outbps = (e->outbps + 0xF) >> 5; } int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs) { INIT_LIST_HEAD(&ipvs->est_list); 
spin_lock_init(&ipvs->est_lock); timer_setup(&ipvs->est_timer, estimation_timer, 0); mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); return 0; } void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs) { del_timer_sync(&ipvs->est_timer); }
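/* The arithmetic in estimation_timer() above is a fixed-point EWMA with
 * W = 1/4, run every 2 seconds. A standalone sketch of the update and decode
 * steps for the connection rate (variable and function names illustrative):
 */
#include <linux/types.h>

static u64 example_cps_update(u64 *avg_scaled, u64 *conns_last, u64 conns_now)
{
	/* delta over the 2-second period, scaled by 2^10 and divided by 2:
	 * (delta << 10) / 2 == delta << 9
	 */
	u64 rate = (conns_now - *conns_last) << 9;

	*conns_last = conns_now;

	/* avg = avg*(1 - 1/4) + rate*(1/4), i.e. avg += (rate - avg) >> 2 */
	*avg_scaled += ((s64)rate - (s64)*avg_scaled) >> 2;

	/* decode back to conns/s with rounding, as ip_vs_read_estimator() does */
	return (*avg_scaled + 0x1FF) >> 10;
}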
850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * * Added support for a Unix98-style ptmx device. * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 14-Jan-1998 * */ #include <linux/module.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/tty.h> #include <linux/tty_flip.h> #include <linux/fcntl.h> #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/major.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/device.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/devpts_fs.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/poll.h> #include <linux/mount.h> #include <linux/file.h> #include <linux/ioctl.h> #include <linux/compat.h> #undef TTY_DEBUG_HANGUP #ifdef TTY_DEBUG_HANGUP # define tty_debug_hangup(tty, f, args...) tty_debug(tty, f, ##args) #else # define tty_debug_hangup(tty, f, args...) do {} while (0) #endif #ifdef CONFIG_UNIX98_PTYS static struct tty_driver *ptm_driver; static struct tty_driver *pts_driver; static DEFINE_MUTEX(devpts_mutex); #endif static void pty_close(struct tty_struct *tty, struct file *filp) { BUG_ON(!tty); if (tty->driver->subtype == PTY_TYPE_MASTER) WARN_ON(tty->count > 1); else { if (tty_io_error(tty)) return; if (tty->count > 2) return; } set_bit(TTY_IO_ERROR, &tty->flags); wake_up_interruptible(&tty->read_wait); wake_up_interruptible(&tty->write_wait); spin_lock_irq(&tty->ctrl_lock); tty->packet = 0; spin_unlock_irq(&tty->ctrl_lock); /* Review - krefs on tty_link ?? */ if (!tty->link) return; set_bit(TTY_OTHER_CLOSED, &tty->link->flags); wake_up_interruptible(&tty->link->read_wait); wake_up_interruptible(&tty->link->write_wait); if (tty->driver->subtype == PTY_TYPE_MASTER) { set_bit(TTY_OTHER_CLOSED, &tty->flags); #ifdef CONFIG_UNIX98_PTYS if (tty->driver == ptm_driver) { mutex_lock(&devpts_mutex); if (tty->link->driver_data) devpts_pty_kill(tty->link->driver_data); mutex_unlock(&devpts_mutex); } #endif tty_vhangup(tty->link); } } /* * The unthrottle routine is called by the line discipline to signal * that it can receive more characters. For PTY's, the TTY_THROTTLED * flag is always set, to force the line discipline to always call the * unthrottle routine when there are fewer than TTY_THRESHOLD_UNTHROTTLE * characters in the queue. This is necessary since each time this * happens, we need to wake up any sleeping processes that could be * (1) trying to send data to the pty, or (2) waiting in wait_until_sent() * for the pty buffer to be drained. */ static void pty_unthrottle(struct tty_struct *tty) { tty_wakeup(tty->link); set_bit(TTY_THROTTLED, &tty->flags); } /** * pty_write - write to a pty * @tty: the tty we write from * @buf: kernel buffer of data * @count: bytes to write * * Our "hardware" write method. Data is coming from the ldisc which * may be in a non sleeping state. We simply throw this at the other * end of the link as if we were an IRQ handler receiving stuff for * the other side of the pty/tty pair. 
*/ static int pty_write(struct tty_struct *tty, const unsigned char *buf, int c) { struct tty_struct *to = tty->link; if (tty->stopped || !c) return 0; return tty_insert_flip_string_and_push_buffer(to->port, buf, c); } /** * pty_write_room - write space * @tty: tty we are writing from * * Report how many bytes the ldisc can send into the queue for * the other device. */ static int pty_write_room(struct tty_struct *tty) { if (tty->stopped) return 0; return tty_buffer_space_avail(tty->link->port); } /** * pty_chars_in_buffer - characters currently in our tx queue * @tty: our tty * * Report how much we have in the transmit queue. As everything is * instantly at the other end this is easy to implement. */ static int pty_chars_in_buffer(struct tty_struct *tty) { return 0; } /* Set the lock flag on a pty */ static int pty_set_lock(struct tty_struct *tty, int __user *arg) { int val; if (get_user(val, arg)) return -EFAULT; if (val) set_bit(TTY_PTY_LOCK, &tty->flags); else clear_bit(TTY_PTY_LOCK, &tty->flags); return 0; } static int pty_get_lock(struct tty_struct *tty, int __user *arg) { int locked = test_bit(TTY_PTY_LOCK, &tty->flags); return put_user(locked, arg); } /* Set the packet mode on a pty */ static int pty_set_pktmode(struct tty_struct *tty, int __user *arg) { int pktmode; if (get_user(pktmode, arg)) return -EFAULT; spin_lock_irq(&tty->ctrl_lock); if (pktmode) { if (!tty->packet) { tty->link->ctrl_status = 0; smp_mb(); tty->packet = 1; } } else tty->packet = 0; spin_unlock_irq(&tty->ctrl_lock); return 0; } /* Get the packet mode of a pty */ static int pty_get_pktmode(struct tty_struct *tty, int __user *arg) { int pktmode = tty->packet; return put_user(pktmode, arg); } /* Send a signal to the slave */ static int pty_signal(struct tty_struct *tty, int sig) { struct pid *pgrp; if (sig != SIGINT && sig != SIGQUIT && sig != SIGTSTP) return -EINVAL; if (tty->link) { pgrp = tty_get_pgrp(tty->link); if (pgrp) kill_pgrp(pgrp, sig, 1); put_pid(pgrp); } return 0; } static void pty_flush_buffer(struct tty_struct *tty) { struct tty_struct *to = tty->link; if (!to) return; tty_buffer_flush(to, NULL); if (to->packet) { spin_lock_irq(&tty->ctrl_lock); tty->ctrl_status |= TIOCPKT_FLUSHWRITE; wake_up_interruptible(&to->read_wait); spin_unlock_irq(&tty->ctrl_lock); } } static int pty_open(struct tty_struct *tty, struct file *filp) { if (!tty || !tty->link) return -ENODEV; if (test_bit(TTY_OTHER_CLOSED, &tty->flags)) goto out; if (test_bit(TTY_PTY_LOCK, &tty->link->flags)) goto out; if (tty->driver->subtype == PTY_TYPE_SLAVE && tty->link->count != 1) goto out; clear_bit(TTY_IO_ERROR, &tty->flags); clear_bit(TTY_OTHER_CLOSED, &tty->link->flags); set_bit(TTY_THROTTLED, &tty->flags); return 0; out: set_bit(TTY_IO_ERROR, &tty->flags); return -EIO; } static void pty_set_termios(struct tty_struct *tty, struct ktermios *old_termios) { /* See if packet mode change of state. 
*/ if (tty->link && tty->link->packet) { int extproc = (old_termios->c_lflag & EXTPROC) | L_EXTPROC(tty); int old_flow = ((old_termios->c_iflag & IXON) && (old_termios->c_cc[VSTOP] == '\023') && (old_termios->c_cc[VSTART] == '\021')); int new_flow = (I_IXON(tty) && STOP_CHAR(tty) == '\023' && START_CHAR(tty) == '\021'); if ((old_flow != new_flow) || extproc) { spin_lock_irq(&tty->ctrl_lock); if (old_flow != new_flow) { tty->ctrl_status &= ~(TIOCPKT_DOSTOP | TIOCPKT_NOSTOP); if (new_flow) tty->ctrl_status |= TIOCPKT_DOSTOP; else tty->ctrl_status |= TIOCPKT_NOSTOP; } if (extproc) tty->ctrl_status |= TIOCPKT_IOCTL; spin_unlock_irq(&tty->ctrl_lock); wake_up_interruptible(&tty->link->read_wait); } } tty->termios.c_cflag &= ~(CSIZE | PARENB); tty->termios.c_cflag |= (CS8 | CREAD); } /** * pty_do_resize - resize event * @tty: tty being resized * @ws: window size being set. * * Update the termios variables and send the necessary signals to * peform a terminal resize correctly */ static int pty_resize(struct tty_struct *tty, struct winsize *ws) { struct pid *pgrp, *rpgrp; struct tty_struct *pty = tty->link; /* For a PTY we need to lock the tty side */ mutex_lock(&tty->winsize_mutex); if (!memcmp(ws, &tty->winsize, sizeof(*ws))) goto done; /* Signal the foreground process group of both ptys */ pgrp = tty_get_pgrp(tty); rpgrp = tty_get_pgrp(pty); if (pgrp) kill_pgrp(pgrp, SIGWINCH, 1); if (rpgrp != pgrp && rpgrp) kill_pgrp(rpgrp, SIGWINCH, 1); put_pid(pgrp); put_pid(rpgrp); tty->winsize = *ws; pty->winsize = *ws; /* Never used so will go away soon */ done: mutex_unlock(&tty->winsize_mutex); return 0; } /** * pty_start - start() handler * pty_stop - stop() handler * @tty: tty being flow-controlled * * Propagates the TIOCPKT status to the master pty. * * NB: only the master pty can be in packet mode so only the slave * needs start()/stop() handlers */ static void pty_start(struct tty_struct *tty) { unsigned long flags; if (tty->link && tty->link->packet) { spin_lock_irqsave(&tty->ctrl_lock, flags); tty->ctrl_status &= ~TIOCPKT_STOP; tty->ctrl_status |= TIOCPKT_START; spin_unlock_irqrestore(&tty->ctrl_lock, flags); wake_up_interruptible_poll(&tty->link->read_wait, EPOLLIN); } } static void pty_stop(struct tty_struct *tty) { unsigned long flags; if (tty->link && tty->link->packet) { spin_lock_irqsave(&tty->ctrl_lock, flags); tty->ctrl_status &= ~TIOCPKT_START; tty->ctrl_status |= TIOCPKT_STOP; spin_unlock_irqrestore(&tty->ctrl_lock, flags); wake_up_interruptible_poll(&tty->link->read_wait, EPOLLIN); } } /** * pty_common_install - set up the pty pair * @driver: the pty driver * @tty: the tty being instantiated * @legacy: true if this is BSD style * * Perform the initial set up for the tty/pty pair. Called from the * tty layer when the port is first opened. 
* * Locking: the caller must hold the tty_mutex */ static int pty_common_install(struct tty_driver *driver, struct tty_struct *tty, bool legacy) { struct tty_struct *o_tty; struct tty_port *ports[2]; int idx = tty->index; int retval = -ENOMEM; /* Opening the slave first has always returned -EIO */ if (driver->subtype != PTY_TYPE_MASTER) return -EIO; ports[0] = kmalloc(sizeof **ports, GFP_KERNEL); ports[1] = kmalloc(sizeof **ports, GFP_KERNEL); if (!ports[0] || !ports[1]) goto err; if (!try_module_get(driver->other->owner)) { /* This cannot in fact currently happen */ goto err; } o_tty = alloc_tty_struct(driver->other, idx); if (!o_tty) goto err_put_module; tty_set_lock_subclass(o_tty); lockdep_set_subclass(&o_tty->termios_rwsem, TTY_LOCK_SLAVE); if (legacy) { /* We always use new tty termios data so we can do this the easy way .. */ tty_init_termios(tty); tty_init_termios(o_tty); driver->other->ttys[idx] = o_tty; driver->ttys[idx] = tty; } else { memset(&tty->termios_locked, 0, sizeof(tty->termios_locked)); tty->termios = driver->init_termios; memset(&o_tty->termios_locked, 0, sizeof(tty->termios_locked)); o_tty->termios = driver->other->init_termios; } /* * Everything allocated ... set up the o_tty structure. */ tty_driver_kref_get(driver->other); /* Establish the links in both directions */ tty->link = o_tty; o_tty->link = tty; tty_port_init(ports[0]); tty_port_init(ports[1]); tty_buffer_set_limit(ports[0], 8192); tty_buffer_set_limit(ports[1], 8192); o_tty->port = ports[0]; tty->port = ports[1]; o_tty->port->itty = o_tty; tty_buffer_set_lock_subclass(o_tty->port); tty_driver_kref_get(driver); tty->count++; o_tty->count++; return 0; err_put_module: module_put(driver->other->owner); err: kfree(ports[0]); kfree(ports[1]); return retval; } static void pty_cleanup(struct tty_struct *tty) { tty_port_put(tty->port); } /* Traditional BSD devices */ #ifdef CONFIG_LEGACY_PTYS static int pty_install(struct tty_driver *driver, struct tty_struct *tty) { return pty_common_install(driver, tty, true); } static void pty_remove(struct tty_driver *driver, struct tty_struct *tty) { struct tty_struct *pair = tty->link; driver->ttys[tty->index] = NULL; if (pair) pair->driver->ttys[pair->index] = NULL; } static int pty_bsd_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { switch (cmd) { case TIOCSPTLCK: /* Set PT Lock (disallow slave open) */ return pty_set_lock(tty, (int __user *) arg); case TIOCGPTLCK: /* Get PT Lock status */ return pty_get_lock(tty, (int __user *)arg); case TIOCPKT: /* Set PT packet mode */ return pty_set_pktmode(tty, (int __user *)arg); case TIOCGPKT: /* Get PT packet mode */ return pty_get_pktmode(tty, (int __user *)arg); case TIOCSIG: /* Send signal to other side of pty */ return pty_signal(tty, (int) arg); case TIOCGPTN: /* TTY returns ENOTTY, but glibc expects EINVAL here */ return -EINVAL; } return -ENOIOCTLCMD; } #ifdef CONFIG_COMPAT static long pty_bsd_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { /* * PTY ioctls don't require any special translation between 32-bit and * 64-bit userspace, they are already compatible. */ return pty_bsd_ioctl(tty, cmd, (unsigned long)compat_ptr(arg)); } #else #define pty_bsd_compat_ioctl NULL #endif static int legacy_count = CONFIG_LEGACY_PTY_COUNT; /* * not really modular, but the easiest way to keep compat with existing * bootargs behaviour is to continue using module_param here. */ module_param(legacy_count, int, 0); /* * The master side of a pty can do TIOCSPTLCK and thus * has pty_bsd_ioctl. 
*/ static const struct tty_operations master_pty_ops_bsd = { .install = pty_install, .open = pty_open, .close = pty_close, .write = pty_write, .write_room = pty_write_room, .flush_buffer = pty_flush_buffer, .chars_in_buffer = pty_chars_in_buffer, .unthrottle = pty_unthrottle, .ioctl = pty_bsd_ioctl, .compat_ioctl = pty_bsd_compat_ioctl, .cleanup = pty_cleanup, .resize = pty_resize, .remove = pty_remove }; static const struct tty_operations slave_pty_ops_bsd = { .install = pty_install, .open = pty_open, .close = pty_close, .write = pty_write, .write_room = pty_write_room, .flush_buffer = pty_flush_buffer, .chars_in_buffer = pty_chars_in_buffer, .unthrottle = pty_unthrottle, .set_termios = pty_set_termios, .cleanup = pty_cleanup, .resize = pty_resize, .start = pty_start, .stop = pty_stop, .remove = pty_remove }; static void __init legacy_pty_init(void) { struct tty_driver *pty_driver, *pty_slave_driver; if (legacy_count <= 0) return; pty_driver = tty_alloc_driver(legacy_count, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_ALLOC); if (IS_ERR(pty_driver)) panic("Couldn't allocate pty driver"); pty_slave_driver = tty_alloc_driver(legacy_count, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_ALLOC); if (IS_ERR(pty_slave_driver)) panic("Couldn't allocate pty slave driver"); pty_driver->driver_name = "pty_master"; pty_driver->name = "pty"; pty_driver->major = PTY_MASTER_MAJOR; pty_driver->minor_start = 0; pty_driver->type = TTY_DRIVER_TYPE_PTY; pty_driver->subtype = PTY_TYPE_MASTER; pty_driver->init_termios = tty_std_termios; pty_driver->init_termios.c_iflag = 0; pty_driver->init_termios.c_oflag = 0; pty_driver->init_termios.c_cflag = B38400 | CS8 | CREAD; pty_driver->init_termios.c_lflag = 0; pty_driver->init_termios.c_ispeed = 38400; pty_driver->init_termios.c_ospeed = 38400; pty_driver->other = pty_slave_driver; tty_set_operations(pty_driver, &master_pty_ops_bsd); pty_slave_driver->driver_name = "pty_slave"; pty_slave_driver->name = "ttyp"; pty_slave_driver->major = PTY_SLAVE_MAJOR; pty_slave_driver->minor_start = 0; pty_slave_driver->type = TTY_DRIVER_TYPE_PTY; pty_slave_driver->subtype = PTY_TYPE_SLAVE; pty_slave_driver->init_termios = tty_std_termios; pty_slave_driver->init_termios.c_cflag = B38400 | CS8 | CREAD; pty_slave_driver->init_termios.c_ispeed = 38400; pty_slave_driver->init_termios.c_ospeed = 38400; pty_slave_driver->other = pty_driver; tty_set_operations(pty_slave_driver, &slave_pty_ops_bsd); if (tty_register_driver(pty_driver)) panic("Couldn't register pty driver"); if (tty_register_driver(pty_slave_driver)) panic("Couldn't register pty slave driver"); } #else static inline void legacy_pty_init(void) { } #endif /* Unix98 devices */ #ifdef CONFIG_UNIX98_PTYS static struct cdev ptmx_cdev; /** * ptm_open_peer - open the peer of a pty * @master: the open struct file of the ptmx device node * @tty: the master of the pty being opened * @flags: the flags for open * * Provide a race free way for userspace to open the slave end of a pty * (where they have the master fd and cannot access or trust the mount * namespace /dev/pts was mounted inside). 
*/ int ptm_open_peer(struct file *master, struct tty_struct *tty, int flags) { int fd = -1; struct file *filp; int retval = -EINVAL; struct path path; if (tty->driver != ptm_driver) return -EIO; fd = get_unused_fd_flags(flags); if (fd < 0) { retval = fd; goto err; } /* Compute the slave's path */ path.mnt = devpts_mntget(master, tty->driver_data); if (IS_ERR(path.mnt)) { retval = PTR_ERR(path.mnt); goto err_put; } path.dentry = tty->link->driver_data; filp = dentry_open(&path, flags, current_cred()); mntput(path.mnt); if (IS_ERR(filp)) { retval = PTR_ERR(filp); goto err_put; } fd_install(fd, filp); return fd; err_put: put_unused_fd(fd); err: return retval; } static int pty_unix98_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { switch (cmd) { case TIOCSPTLCK: /* Set PT Lock (disallow slave open) */ return pty_set_lock(tty, (int __user *)arg); case TIOCGPTLCK: /* Get PT Lock status */ return pty_get_lock(tty, (int __user *)arg); case TIOCPKT: /* Set PT packet mode */ return pty_set_pktmode(tty, (int __user *)arg); case TIOCGPKT: /* Get PT packet mode */ return pty_get_pktmode(tty, (int __user *)arg); case TIOCGPTN: /* Get PT Number */ return put_user(tty->index, (unsigned int __user *)arg); case TIOCSIG: /* Send signal to other side of pty */ return pty_signal(tty, (int) arg); } return -ENOIOCTLCMD; } #ifdef CONFIG_COMPAT static long pty_unix98_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { /* * PTY ioctls don't require any special translation between 32-bit and * 64-bit userspace, they are already compatible. */ return pty_unix98_ioctl(tty, cmd, cmd == TIOCSIG ? arg : (unsigned long)compat_ptr(arg)); } #else #define pty_unix98_compat_ioctl NULL #endif /** * ptm_unix98_lookup - find a pty master * @driver: ptm driver * @idx: tty index * * Look up a pty master device. Called under the tty_mutex for now. * This provides our locking. */ static struct tty_struct *ptm_unix98_lookup(struct tty_driver *driver, struct file *file, int idx) { /* Master must be open via /dev/ptmx */ return ERR_PTR(-EIO); } /** * pts_unix98_lookup - find a pty slave * @driver: pts driver * @idx: tty index * * Look up a pty master device. Called under the tty_mutex for now. * This provides our locking for the tty pointer. 
*/ static struct tty_struct *pts_unix98_lookup(struct tty_driver *driver, struct file *file, int idx) { struct tty_struct *tty; mutex_lock(&devpts_mutex); tty = devpts_get_priv(file->f_path.dentry); mutex_unlock(&devpts_mutex); /* Master must be open before slave */ if (!tty) return ERR_PTR(-EIO); return tty; } static int pty_unix98_install(struct tty_driver *driver, struct tty_struct *tty) { return pty_common_install(driver, tty, false); } /* this is called once with whichever end is closed last */ static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty) { struct pts_fs_info *fsi; if (tty->driver->subtype == PTY_TYPE_MASTER) fsi = tty->driver_data; else fsi = tty->link->driver_data; if (fsi) { devpts_kill_index(fsi, tty->index); devpts_release(fsi); } } static void pty_show_fdinfo(struct tty_struct *tty, struct seq_file *m) { seq_printf(m, "tty-index:\t%d\n", tty->index); } static const struct tty_operations ptm_unix98_ops = { .lookup = ptm_unix98_lookup, .install = pty_unix98_install, .remove = pty_unix98_remove, .open = pty_open, .close = pty_close, .write = pty_write, .write_room = pty_write_room, .flush_buffer = pty_flush_buffer, .chars_in_buffer = pty_chars_in_buffer, .unthrottle = pty_unthrottle, .ioctl = pty_unix98_ioctl, .compat_ioctl = pty_unix98_compat_ioctl, .resize = pty_resize, .cleanup = pty_cleanup, .show_fdinfo = pty_show_fdinfo, }; static const struct tty_operations pty_unix98_ops = { .lookup = pts_unix98_lookup, .install = pty_unix98_install, .remove = pty_unix98_remove, .open = pty_open, .close = pty_close, .write = pty_write, .write_room = pty_write_room, .flush_buffer = pty_flush_buffer, .chars_in_buffer = pty_chars_in_buffer, .unthrottle = pty_unthrottle, .set_termios = pty_set_termios, .start = pty_start, .stop = pty_stop, .cleanup = pty_cleanup, }; /** * ptmx_open - open a unix 98 pty master * @inode: inode of device file * @filp: file pointer to tty * * Allocate a unix98 pty master device from the ptmx driver. * * Locking: tty_mutex protects the init_dev work. tty->count should * protect the rest. * allocated_ptys_lock handles the list of free pty numbers */ static int ptmx_open(struct inode *inode, struct file *filp) { struct pts_fs_info *fsi; struct tty_struct *tty; struct dentry *dentry; int retval; int index; nonseekable_open(inode, filp); /* We refuse fsnotify events on ptmx, since it's a shared resource */ filp->f_mode |= FMODE_NONOTIFY; retval = tty_alloc_file(filp); if (retval) return retval; fsi = devpts_acquire(filp); if (IS_ERR(fsi)) { retval = PTR_ERR(fsi); goto out_free_file; } /* find a device that is not in use. 
*/ mutex_lock(&devpts_mutex); index = devpts_new_index(fsi); mutex_unlock(&devpts_mutex); retval = index; if (index < 0) goto out_put_fsi; mutex_lock(&tty_mutex); tty = tty_init_dev(ptm_driver, index); /* The tty returned here is locked so we can safely drop the mutex */ mutex_unlock(&tty_mutex); retval = PTR_ERR(tty); if (IS_ERR(tty)) goto out; /* * From here on out, the tty is "live", and the index and * fsi will be killed/put by the tty_release() */ set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ tty->driver_data = fsi; tty_add_file(tty, filp); dentry = devpts_pty_new(fsi, index, tty->link); if (IS_ERR(dentry)) { retval = PTR_ERR(dentry); goto err_release; } tty->link->driver_data = dentry; retval = ptm_driver->ops->open(tty, filp); if (retval) goto err_release; tty_debug_hangup(tty, "opening (count=%d)\n", tty->count); tty_unlock(tty); return 0; err_release: tty_unlock(tty); // This will also put-ref the fsi tty_release(inode, filp); return retval; out: devpts_kill_index(fsi, index); out_put_fsi: devpts_release(fsi); out_free_file: tty_free_file(filp); return retval; } static struct file_operations ptmx_fops __ro_after_init; static void __init unix98_pty_init(void) { ptm_driver = tty_alloc_driver(NR_UNIX98_PTY_MAX, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV | TTY_DRIVER_DEVPTS_MEM | TTY_DRIVER_DYNAMIC_ALLOC); if (IS_ERR(ptm_driver)) panic("Couldn't allocate Unix98 ptm driver"); pts_driver = tty_alloc_driver(NR_UNIX98_PTY_MAX, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV | TTY_DRIVER_DEVPTS_MEM | TTY_DRIVER_DYNAMIC_ALLOC); if (IS_ERR(pts_driver)) panic("Couldn't allocate Unix98 pts driver"); ptm_driver->driver_name = "pty_master"; ptm_driver->name = "ptm"; ptm_driver->major = UNIX98_PTY_MASTER_MAJOR; ptm_driver->minor_start = 0; ptm_driver->type = TTY_DRIVER_TYPE_PTY; ptm_driver->subtype = PTY_TYPE_MASTER; ptm_driver->init_termios = tty_std_termios; ptm_driver->init_termios.c_iflag = 0; ptm_driver->init_termios.c_oflag = 0; ptm_driver->init_termios.c_cflag = B38400 | CS8 | CREAD; ptm_driver->init_termios.c_lflag = 0; ptm_driver->init_termios.c_ispeed = 38400; ptm_driver->init_termios.c_ospeed = 38400; ptm_driver->other = pts_driver; tty_set_operations(ptm_driver, &ptm_unix98_ops); pts_driver->driver_name = "pty_slave"; pts_driver->name = "pts"; pts_driver->major = UNIX98_PTY_SLAVE_MAJOR; pts_driver->minor_start = 0; pts_driver->type = TTY_DRIVER_TYPE_PTY; pts_driver->subtype = PTY_TYPE_SLAVE; pts_driver->init_termios = tty_std_termios; pts_driver->init_termios.c_cflag = B38400 | CS8 | CREAD; pts_driver->init_termios.c_ispeed = 38400; pts_driver->init_termios.c_ospeed = 38400; pts_driver->other = ptm_driver; tty_set_operations(pts_driver, &pty_unix98_ops); if (tty_register_driver(ptm_driver)) panic("Couldn't register Unix98 ptm driver"); if (tty_register_driver(pts_driver)) panic("Couldn't register Unix98 pts driver"); /* Now create the /dev/ptmx special device */ tty_default_fops(&ptmx_fops); ptmx_fops.open = ptmx_open; cdev_init(&ptmx_cdev, &ptmx_fops); if (cdev_add(&ptmx_cdev, MKDEV(TTYAUX_MAJOR, 2), 1) || register_chrdev_region(MKDEV(TTYAUX_MAJOR, 2), 1, "/dev/ptmx") < 0) panic("Couldn't register /dev/ptmx driver"); device_create(tty_class, NULL, MKDEV(TTYAUX_MAJOR, 2), NULL, "ptmx"); } #else static inline void unix98_pty_init(void) { } #endif static int __init pty_init(void) { legacy_pty_init(); unix98_pty_init(); return 0; } device_initcall(pty_init); |
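/* The TIOCSPTLCK/TIOCGPTN handling in pty_unix98_ioctl() above is what user
 * space relies on when it sets up a pty pair (glibc's grantpt(), unlockpt()
 * and ptsname() wrap the same ioctls). A minimal user-space sketch, error
 * handling trimmed and names illustrative:
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>

static int example_open_pty_pair(int *master_fd, int *slave_fd)
{
	int unlock = 0, ptn;
	char slave_path[32];

	*master_fd = open("/dev/ptmx", O_RDWR | O_NOCTTY);
	if (*master_fd < 0)
		return -1;

	/* clear the TTY_PTY_LOCK bit set by ptmx_open() ("LOCK THE SLAVE") */
	if (ioctl(*master_fd, TIOCSPTLCK, &unlock) < 0)
		return -1;

	/* ask the master which /dev/pts index it was given */
	if (ioctl(*master_fd, TIOCGPTN, &ptn) < 0)
		return -1;

	snprintf(slave_path, sizeof(slave_path), "/dev/pts/%d", ptn);
	*slave_fd = open(slave_path, O_RDWR | O_NOCTTY);
	return *slave_fd < 0 ? -1 : 0;
}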
2 2 2 2 1 1 1 31 31 31 31 31 31 31 31 3 1 3 11 11 3 1 1 3 33 31 17 33 12 1 10 5 5 12 11 12 85 11 5 11 11 13 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 | /* * Created: Tue Feb 2 08:37:54 1999 by faith@valinux.com * * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * All Rights Reserved. * * Author Rickard E. (Rik) Faith <faith@valinux.com> * Author Gareth Hughes <gareth@valinux.com> * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include <drm/drmP.h> #include "drm_internal.h" #include "drm_legacy.h" #include <drm/drm_lease.h> /** * DOC: master and authentication * * &struct drm_master is used to track groups of clients with open * primary/legacy device nodes. For every &struct drm_file which has had at * least once successfully became the device master (either through the * SET_MASTER IOCTL, or implicitly through opening the primary device node when * no one else is the current master that time) there exists one &drm_master. * This is noted in &drm_file.is_master. All other clients have just a pointer * to the &drm_master they are associated with. 
* * In addition only one &drm_master can be the current master for a &drm_device. * It can be switched through the DROP_MASTER and SET_MASTER IOCTL, or * implicitly through closing/openeing the primary device node. See also * drm_is_current_master(). * * Clients can authenticate against the current master (if it matches their own) * using the GETMAGIC and AUTHMAGIC IOCTLs. Together with exchanging masters, * this allows controlled access to the device for an entire group of mutually * trusted clients. */ int drm_getmagic(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_auth *auth = data; int ret = 0; mutex_lock(&dev->master_mutex); if (!file_priv->magic) { ret = idr_alloc(&file_priv->master->magic_map, file_priv, 1, 0, GFP_KERNEL); if (ret >= 0) file_priv->magic = ret; } auth->magic = file_priv->magic; mutex_unlock(&dev->master_mutex); DRM_DEBUG("%u\n", auth->magic); return ret < 0 ? ret : 0; } int drm_authmagic(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_auth *auth = data; struct drm_file *file; DRM_DEBUG("%u\n", auth->magic); mutex_lock(&dev->master_mutex); file = idr_find(&file_priv->master->magic_map, auth->magic); if (file) { file->authenticated = 1; idr_replace(&file_priv->master->magic_map, NULL, auth->magic); } mutex_unlock(&dev->master_mutex); return file ? 0 : -EINVAL; } struct drm_master *drm_master_create(struct drm_device *dev) { struct drm_master *master; master = kzalloc(sizeof(*master), GFP_KERNEL); if (!master) return NULL; kref_init(&master->refcount); spin_lock_init(&master->lock.spinlock); init_waitqueue_head(&master->lock.lock_queue); idr_init(&master->magic_map); master->dev = dev; /* initialize the tree of output resource lessees */ master->lessor = NULL; master->lessee_id = 0; INIT_LIST_HEAD(&master->lessees); INIT_LIST_HEAD(&master->lessee_list); idr_init(&master->leases); idr_init(&master->lessee_idr); return master; } static int drm_set_master(struct drm_device *dev, struct drm_file *fpriv, bool new_master) { int ret = 0; dev->master = drm_master_get(fpriv->master); if (dev->driver->master_set) { ret = dev->driver->master_set(dev, fpriv, new_master); if (unlikely(ret != 0)) { drm_master_put(&dev->master); } } return ret; } static int drm_new_set_master(struct drm_device *dev, struct drm_file *fpriv) { struct drm_master *old_master; int ret; lockdep_assert_held_once(&dev->master_mutex); WARN_ON(fpriv->is_master); old_master = fpriv->master; fpriv->master = drm_master_create(dev); if (!fpriv->master) { fpriv->master = old_master; return -ENOMEM; } if (dev->driver->master_create) { ret = dev->driver->master_create(dev, fpriv->master); if (ret) goto out_err; } fpriv->is_master = 1; fpriv->authenticated = 1; ret = drm_set_master(dev, fpriv, true); if (ret) goto out_err; if (old_master) drm_master_put(&old_master); return 0; out_err: /* drop references and restore old master on failure */ drm_master_put(&fpriv->master); fpriv->master = old_master; fpriv->is_master = 0; return ret; } int drm_setmaster_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { int ret = 0; mutex_lock(&dev->master_mutex); if (drm_is_current_master(file_priv)) goto out_unlock; if (dev->master) { ret = -EINVAL; goto out_unlock; } if (!file_priv->master) { ret = -EINVAL; goto out_unlock; } if (!file_priv->is_master) { ret = drm_new_set_master(dev, file_priv); goto out_unlock; } if (file_priv->master->lessor != NULL) { DRM_DEBUG_LEASE("Attempt to set lessee %d as master\n", file_priv->master->lessee_id); ret = -EINVAL; 
goto out_unlock; } ret = drm_set_master(dev, file_priv, false); out_unlock: mutex_unlock(&dev->master_mutex); return ret; } static void drm_drop_master(struct drm_device *dev, struct drm_file *fpriv) { if (dev->driver->master_drop) dev->driver->master_drop(dev, fpriv); drm_master_put(&dev->master); } int drm_dropmaster_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { int ret = -EINVAL; mutex_lock(&dev->master_mutex); if (!drm_is_current_master(file_priv)) goto out_unlock; if (!dev->master) goto out_unlock; if (file_priv->master->lessor != NULL) { DRM_DEBUG_LEASE("Attempt to drop lessee %d as master\n", file_priv->master->lessee_id); ret = -EINVAL; goto out_unlock; } ret = 0; drm_drop_master(dev, file_priv); out_unlock: mutex_unlock(&dev->master_mutex); return ret; } int drm_master_open(struct drm_file *file_priv) { struct drm_device *dev = file_priv->minor->dev; int ret = 0; /* if there is no current master make this fd it, but do not create * any master object for render clients */ mutex_lock(&dev->master_mutex); if (!dev->master) ret = drm_new_set_master(dev, file_priv); else file_priv->master = drm_master_get(dev->master); mutex_unlock(&dev->master_mutex); return ret; } void drm_master_release(struct drm_file *file_priv) { struct drm_device *dev = file_priv->minor->dev; struct drm_master *master; mutex_lock(&dev->master_mutex); master = file_priv->master; if (file_priv->magic) idr_remove(&file_priv->master->magic_map, file_priv->magic); if (!drm_is_current_master(file_priv)) goto out; if (drm_core_check_feature(dev, DRIVER_LEGACY)) { /* * Since the master is disappearing, so is the * possibility to lock. */ mutex_lock(&dev->struct_mutex); if (master->lock.hw_lock) { if (dev->sigdata.lock == master->lock.hw_lock) dev->sigdata.lock = NULL; master->lock.hw_lock = NULL; master->lock.file_priv = NULL; wake_up_interruptible_all(&master->lock.lock_queue); } mutex_unlock(&dev->struct_mutex); } if (dev->master == file_priv->master) drm_drop_master(dev, file_priv); out: if (drm_core_check_feature(dev, DRIVER_MODESET) && file_priv->is_master) { /* Revoke any leases held by this or lessees, but only if * this is the "real" master */ drm_lease_revoke(master); } /* drop the master reference held by the file priv */ if (file_priv->master) drm_master_put(&file_priv->master); mutex_unlock(&dev->master_mutex); } /** * drm_is_current_master - checks whether @priv is the current master * @fpriv: DRM file private * * Checks whether @fpriv is current master on its device. This decides whether a * client is allowed to run DRM_MASTER IOCTLs. * * Most of the modern IOCTL which require DRM_MASTER are for kernel modesetting * - the current master is assumed to own the non-shareable display hardware. */ bool drm_is_current_master(struct drm_file *fpriv) { return fpriv->is_master && drm_lease_owner(fpriv->master) == fpriv->minor->dev->master; } EXPORT_SYMBOL(drm_is_current_master); /** * drm_master_get - reference a master pointer * @master: &struct drm_master * * Increments the reference count of @master and returns a pointer to @master. 
*/ struct drm_master *drm_master_get(struct drm_master *master) { kref_get(&master->refcount); return master; } EXPORT_SYMBOL(drm_master_get); static void drm_master_destroy(struct kref *kref) { struct drm_master *master = container_of(kref, struct drm_master, refcount); struct drm_device *dev = master->dev; if (drm_core_check_feature(dev, DRIVER_MODESET)) drm_lease_destroy(master); if (dev->driver->master_destroy) dev->driver->master_destroy(dev, master); drm_legacy_master_rmmaps(dev, master); idr_destroy(&master->magic_map); idr_destroy(&master->leases); idr_destroy(&master->lessee_idr); kfree(master->unique); kfree(master); } /** * drm_master_put - unreference and clear a master pointer * @master: pointer to a pointer of &struct drm_master * * This decrements the &drm_master behind @master and sets it to NULL. */ void drm_master_put(struct drm_master **master) { kref_put(&(*master)->refcount, drm_master_destroy); *master = NULL; } EXPORT_SYMBOL(drm_master_put); |
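/*
 * Illustrative userspace sketch of the GETMAGIC/AUTHMAGIC handshake that
 * drm_getmagic() and drm_authmagic() above implement; it is not part of
 * drm_auth.c. Assumptions: authenticate_client(), master_fd and client_fd are
 * made-up names, both descriptors are already open on the same DRM device
 * node (e.g. /dev/dri/card0), master_fd belongs to the current master, and
 * the uapi header is reachable as <drm/drm.h> (with libdrm installed it may
 * be <libdrm/drm.h>). In a real compositor the magic travels between the two
 * processes over IPC rather than staying inside one function.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <drm/drm.h>

static int authenticate_client(int master_fd, int client_fd)
{
	struct drm_auth auth = { .magic = 0 };

	/* Client side: DRM_IOCTL_GET_MAGIC (drm_getmagic()) hands out a
	 * per-file magic token.
	 */
	if (ioctl(client_fd, DRM_IOCTL_GET_MAGIC, &auth) < 0) {
		perror("DRM_IOCTL_GET_MAGIC");
		return -1;
	}

	/* Master side: DRM_IOCTL_AUTH_MAGIC (drm_authmagic()) marks the file
	 * that owns this magic as authenticated, so its DRM_AUTH-protected
	 * ioctls start to succeed.
	 */
	if (ioctl(master_fd, DRM_IOCTL_AUTH_MAGIC, &auth) < 0) {
		perror("DRM_IOCTL_AUTH_MAGIC");
		return -1;
	}

	return 0;
}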
/* * net/sched/act_api.c Packet action API. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * Author: Jamal Hadi Salim * * */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/kmod.h> #include <linux/err.h> #include <linux/module.h> #include <linux/rhashtable.h> #include <linux/list.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/sch_generic.h> #include <net/pkt_cls.h> #include <net/act_api.h> #include <net/netlink.h> static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp) { u32 chain_index = a->tcfa_action & TC_ACT_EXT_VAL_MASK; if (!tp) return -EINVAL; a->goto_chain = tcf_chain_get_by_act(tp->chain->block, chain_index); if (!a->goto_chain) return -ENOMEM; return 0; } static void tcf_action_goto_chain_fini(struct tc_action *a) { tcf_chain_put_by_act(a->goto_chain); } static void tcf_action_goto_chain_exec(const struct tc_action *a, struct tcf_result *res) { const struct tcf_chain *chain = a->goto_chain; res->goto_tp = rcu_dereference_bh(chain->filter_chain); } static void tcf_free_cookie_rcu(struct rcu_head *p) { struct tc_cookie *cookie = container_of(p, struct tc_cookie, rcu); kfree(cookie->data); kfree(cookie); } static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie, struct tc_cookie *new_cookie) { struct tc_cookie *old; old = xchg((__force struct tc_cookie **)old_cookie, new_cookie); if (old) call_rcu(&old->rcu, tcf_free_cookie_rcu); } /* XXX: For standalone actions, we don't need an RCU grace period either, because * actions are always connected to filters and filters are already destroyed in * RCU callbacks, so after an RCU grace period actions are already disconnected * from filters. Readers can no longer find us.
*/ static void free_tcf(struct tc_action *p) { free_percpu(p->cpu_bstats); free_percpu(p->cpu_qstats); tcf_set_action_cookie(&p->act_cookie, NULL); if (p->goto_chain) tcf_action_goto_chain_fini(p); kfree(p); } static void tcf_action_cleanup(struct tc_action *p) { if (p->ops->cleanup) p->ops->cleanup(p); gen_kill_estimator(&p->tcfa_rate_est); free_tcf(p); } static int __tcf_action_put(struct tc_action *p, bool bind) { struct tcf_idrinfo *idrinfo = p->idrinfo; if (refcount_dec_and_lock(&p->tcfa_refcnt, &idrinfo->lock)) { if (bind) atomic_dec(&p->tcfa_bindcnt); idr_remove(&idrinfo->action_idr, p->tcfa_index); spin_unlock(&idrinfo->lock); tcf_action_cleanup(p); return 1; } if (bind) atomic_dec(&p->tcfa_bindcnt); return 0; } int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) { int ret = 0; /* Release with strict==1 and bind==0 is only called through act API * interface (classifiers always bind). Only case when action with * positive reference count and zero bind count can exist is when it was * also created with act API (unbinding last classifier will destroy the * action if it was created by classifier). So only case when bind count * can be changed after initial check is when unbound action is * destroyed by act API while classifier binds to action with same id * concurrently. This result either creation of new action(same behavior * as before), or reusing existing action if concurrent process * increments reference count before action is deleted. Both scenarios * are acceptable. */ if (p) { if (!bind && strict && atomic_read(&p->tcfa_bindcnt) > 0) return -EPERM; if (__tcf_action_put(p, bind)) ret = ACT_P_DELETED; } return ret; } EXPORT_SYMBOL(__tcf_idr_release); static size_t tcf_action_shared_attrs_size(const struct tc_action *act) { struct tc_cookie *act_cookie; u32 cookie_len = 0; rcu_read_lock(); act_cookie = rcu_dereference(act->act_cookie); if (act_cookie) cookie_len = nla_total_size(act_cookie->len); rcu_read_unlock(); return nla_total_size(0) /* action number nested */ + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */ + cookie_len /* TCA_ACT_COOKIE */ + nla_total_size(0) /* TCA_ACT_STATS nested */ /* TCA_STATS_BASIC */ + nla_total_size_64bit(sizeof(struct gnet_stats_basic)) /* TCA_STATS_QUEUE */ + nla_total_size_64bit(sizeof(struct gnet_stats_queue)) + nla_total_size(0) /* TCA_OPTIONS nested */ + nla_total_size(sizeof(struct tcf_t)); /* TCA_GACT_TM */ } static size_t tcf_action_full_attrs_size(size_t sz) { return NLMSG_HDRLEN /* struct nlmsghdr */ + sizeof(struct tcamsg) + nla_total_size(0) /* TCA_ACT_TAB nested */ + sz; } static size_t tcf_action_fill_size(const struct tc_action *act) { size_t sz = tcf_action_shared_attrs_size(act); if (act->ops->get_fill_size) return act->ops->get_fill_size(act) + sz; return sz; } static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, struct netlink_callback *cb) { int err = 0, index = -1, s_i = 0, n_i = 0; u32 act_flags = cb->args[2]; unsigned long jiffy_since = cb->args[3]; struct nlattr *nest; struct idr *idr = &idrinfo->action_idr; struct tc_action *p; unsigned long id = 1; spin_lock(&idrinfo->lock); s_i = cb->args[0]; idr_for_each_entry_ul(idr, p, id) { index++; if (index < s_i) continue; if (jiffy_since && time_after(jiffy_since, (unsigned long)p->tcfa_tm.lastuse)) continue; nest = nla_nest_start(skb, n_i); if (!nest) { index--; goto nla_put_failure; } err = tcf_action_dump_1(skb, p, 0, 0); if (err < 0) { index--; nlmsg_trim(skb, nest); goto done; } nla_nest_end(skb, nest); n_i++; if (!(act_flags & 
TCA_FLAG_LARGE_DUMP_ON) && n_i >= TCA_ACT_MAX_PRIO) goto done; } done: if (index >= 0) cb->args[0] = index + 1; spin_unlock(&idrinfo->lock); if (n_i) { if (act_flags & TCA_FLAG_LARGE_DUMP_ON) cb->args[1] = n_i; } return n_i; nla_put_failure: nla_nest_cancel(skb, nest); goto done; } static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, const struct tc_action_ops *ops) { struct nlattr *nest; int n_i = 0; int ret = -EINVAL; struct idr *idr = &idrinfo->action_idr; struct tc_action *p; unsigned long id = 1; nest = nla_nest_start(skb, 0); if (nest == NULL) goto nla_put_failure; if (nla_put_string(skb, TCA_KIND, ops->kind)) goto nla_put_failure; idr_for_each_entry_ul(idr, p, id) { ret = __tcf_idr_release(p, false, true); if (ret == ACT_P_DELETED) { module_put(ops->owner); n_i++; } else if (ret < 0) { goto nla_put_failure; } } if (nla_put_u32(skb, TCA_FCNT, n_i)) goto nla_put_failure; nla_nest_end(skb, nest); return n_i; nla_put_failure: nla_nest_cancel(skb, nest); return ret; } int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct tcf_idrinfo *idrinfo = tn->idrinfo; if (type == RTM_DELACTION) { return tcf_del_walker(idrinfo, skb, ops); } else if (type == RTM_GETACTION) { return tcf_dump_walker(idrinfo, skb, cb); } else { WARN(1, "tcf_generic_walker: unknown command %d\n", type); NL_SET_ERR_MSG(extack, "tcf_generic_walker: unknown command"); return -EINVAL; } } EXPORT_SYMBOL(tcf_generic_walker); int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index) { struct tcf_idrinfo *idrinfo = tn->idrinfo; struct tc_action *p; spin_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); if (IS_ERR(p)) p = NULL; else if (p) refcount_inc(&p->tcfa_refcnt); spin_unlock(&idrinfo->lock); if (p) { *a = p; return true; } return false; } EXPORT_SYMBOL(tcf_idr_search); static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index) { struct tc_action *p; int ret = 0; spin_lock(&idrinfo->lock); p = idr_find(&idrinfo->action_idr, index); if (!p) { spin_unlock(&idrinfo->lock); return -ENOENT; } if (!atomic_read(&p->tcfa_bindcnt)) { if (refcount_dec_and_test(&p->tcfa_refcnt)) { struct module *owner = p->ops->owner; WARN_ON(p != idr_remove(&idrinfo->action_idr, p->tcfa_index)); spin_unlock(&idrinfo->lock); tcf_action_cleanup(p); module_put(owner); return 0; } ret = 0; } else { ret = -EPERM; } spin_unlock(&idrinfo->lock); return ret; } int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, bool cpustats) { struct tc_action *p = kzalloc(ops->size, GFP_KERNEL); struct tcf_idrinfo *idrinfo = tn->idrinfo; int err = -ENOMEM; if (unlikely(!p)) return -ENOMEM; refcount_set(&p->tcfa_refcnt, 1); if (bind) atomic_set(&p->tcfa_bindcnt, 1); if (cpustats) { p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); if (!p->cpu_bstats) goto err1; p->cpu_qstats = alloc_percpu(struct gnet_stats_queue); if (!p->cpu_qstats) goto err2; } spin_lock_init(&p->tcfa_lock); p->tcfa_index = index; p->tcfa_tm.install = jiffies; p->tcfa_tm.lastuse = jiffies; p->tcfa_tm.firstuse = 0; if (est) { err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, &p->tcfa_rate_est, &p->tcfa_lock, NULL, est); if (err) goto err3; } p->idrinfo = idrinfo; p->ops = ops; *a = p; return 0; err3: free_percpu(p->cpu_qstats); err2: free_percpu(p->cpu_bstats); err1: kfree(p); return err; } 
EXPORT_SYMBOL(tcf_idr_create); void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a) { struct tcf_idrinfo *idrinfo = tn->idrinfo; spin_lock(&idrinfo->lock); /* Replace ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ WARN_ON(!IS_ERR(idr_replace(&idrinfo->action_idr, a, a->tcfa_index))); spin_unlock(&idrinfo->lock); } EXPORT_SYMBOL(tcf_idr_insert); /* Cleanup idr index that was allocated but not initialized. */ void tcf_idr_cleanup(struct tc_action_net *tn, u32 index) { struct tcf_idrinfo *idrinfo = tn->idrinfo; spin_lock(&idrinfo->lock); /* Remove ERR_PTR(-EBUSY) allocated by tcf_idr_check_alloc */ WARN_ON(!IS_ERR(idr_remove(&idrinfo->action_idr, index))); spin_unlock(&idrinfo->lock); } EXPORT_SYMBOL(tcf_idr_cleanup); /* Check if action with specified index exists. If actions is found, increments * its reference and bind counters, and return 1. Otherwise insert temporary * error pointer (to prevent concurrent users from inserting actions with same * index) and return 0. */ int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, struct tc_action **a, int bind) { struct tcf_idrinfo *idrinfo = tn->idrinfo; struct tc_action *p; int ret; again: spin_lock(&idrinfo->lock); if (*index) { p = idr_find(&idrinfo->action_idr, *index); if (IS_ERR(p)) { /* This means that another process allocated * index but did not assign the pointer yet. */ spin_unlock(&idrinfo->lock); goto again; } if (p) { refcount_inc(&p->tcfa_refcnt); if (bind) atomic_inc(&p->tcfa_bindcnt); *a = p; ret = 1; } else { *a = NULL; ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, *index, GFP_ATOMIC); if (!ret) idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY), *index); } } else { *index = 1; *a = NULL; ret = idr_alloc_u32(&idrinfo->action_idr, NULL, index, UINT_MAX, GFP_ATOMIC); if (!ret) idr_replace(&idrinfo->action_idr, ERR_PTR(-EBUSY), *index); } spin_unlock(&idrinfo->lock); return ret; } EXPORT_SYMBOL(tcf_idr_check_alloc); void tcf_idrinfo_destroy(const struct tc_action_ops *ops, struct tcf_idrinfo *idrinfo) { struct idr *idr = &idrinfo->action_idr; struct tc_action *p; int ret; unsigned long id = 1; idr_for_each_entry_ul(idr, p, id) { ret = __tcf_idr_release(p, false, true); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) return; } idr_destroy(&idrinfo->action_idr); } EXPORT_SYMBOL(tcf_idrinfo_destroy); static LIST_HEAD(act_base); static DEFINE_RWLOCK(act_mod_lock); int tcf_register_action(struct tc_action_ops *act, struct pernet_operations *ops) { struct tc_action_ops *a; int ret; if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup) return -EINVAL; /* We have to register pernet ops before making the action ops visible, * otherwise tcf_action_init_1() could get a partially initialized * netns. 
*/ ret = register_pernet_subsys(ops); if (ret) return ret; write_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) { write_unlock(&act_mod_lock); unregister_pernet_subsys(ops); return -EEXIST; } } list_add_tail(&act->head, &act_base); write_unlock(&act_mod_lock); return 0; } EXPORT_SYMBOL(tcf_register_action); int tcf_unregister_action(struct tc_action_ops *act, struct pernet_operations *ops) { struct tc_action_ops *a; int err = -ENOENT; write_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (a == act) { list_del(&act->head); err = 0; break; } } write_unlock(&act_mod_lock); if (!err) unregister_pernet_subsys(ops); return err; } EXPORT_SYMBOL(tcf_unregister_action); /* lookup by name */ static struct tc_action_ops *tc_lookup_action_n(char *kind) { struct tc_action_ops *a, *res = NULL; if (kind) { read_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (strcmp(kind, a->kind) == 0) { if (try_module_get(a->owner)) res = a; break; } } read_unlock(&act_mod_lock); } return res; } /* lookup by nlattr */ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind) { struct tc_action_ops *a, *res = NULL; if (kind) { read_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { if (nla_strcmp(kind, a->kind) == 0) { if (try_module_get(a->owner)) res = a; break; } } read_unlock(&act_mod_lock); } return res; } /*TCA_ACT_MAX_PRIO is 32, there count upto 32 */ #define TCA_ACT_MAX_PRIO_MASK 0x1FF int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res) { u32 jmp_prgcnt = 0; u32 jmp_ttl = TCA_ACT_MAX_PRIO; /*matches actions per filter */ int i; int ret = TC_ACT_OK; if (skb_skip_tc_classify(skb)) return TC_ACT_OK; restart_act_graph: for (i = 0; i < nr_actions; i++) { const struct tc_action *a = actions[i]; int repeat_ttl; if (jmp_prgcnt > 0) { jmp_prgcnt -= 1; continue; } repeat_ttl = 32; repeat: ret = a->ops->act(skb, a, res); if (unlikely(ret == TC_ACT_REPEAT)) { if (--repeat_ttl != 0) goto repeat; /* suspicious opcode, stop pipeline */ net_warn_ratelimited("TC_ACT_REPEAT abuse ?\n"); return TC_ACT_OK; } if (TC_ACT_EXT_CMP(ret, TC_ACT_JUMP)) { jmp_prgcnt = ret & TCA_ACT_MAX_PRIO_MASK; if (!jmp_prgcnt || (jmp_prgcnt > nr_actions)) { /* faulty opcode, stop pipeline */ return TC_ACT_OK; } else { jmp_ttl -= 1; if (jmp_ttl > 0) goto restart_act_graph; else /* faulty graph, stop pipeline */ return TC_ACT_OK; } } else if (TC_ACT_EXT_CMP(ret, TC_ACT_GOTO_CHAIN)) { tcf_action_goto_chain_exec(a, res); } if (ret != TC_ACT_PIPE) break; } return ret; } EXPORT_SYMBOL(tcf_action_exec); int tcf_action_destroy(struct tc_action *actions[], int bind) { const struct tc_action_ops *ops; struct tc_action *a; int ret = 0, i; for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { a = actions[i]; actions[i] = NULL; ops = a->ops; ret = __tcf_idr_release(a, bind, true); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) return ret; } return ret; } static int tcf_action_destroy_1(struct tc_action *a, int bind) { struct tc_action *actions[] = { a, NULL }; return tcf_action_destroy(actions, bind); } static int tcf_action_put(struct tc_action *p) { return __tcf_action_put(p, false); } /* Put all actions in this array, skip those NULL's. 
*/ static void tcf_action_put_many(struct tc_action *actions[]) { int i; for (i = 0; i < TCA_ACT_MAX_PRIO; i++) { struct tc_action *a = actions[i]; const struct tc_action_ops *ops; if (!a) continue; ops = a->ops; if (tcf_action_put(a)) module_put(ops->owner); } } int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { return a->ops->dump(skb, a, bind, ref); } int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { int err = -EINVAL; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; struct tc_cookie *cookie; if (nla_put_string(skb, TCA_KIND, a->ops->kind)) goto nla_put_failure; if (tcf_action_copy_stats(skb, a, 0)) goto nla_put_failure; rcu_read_lock(); cookie = rcu_dereference(a->act_cookie); if (cookie) { if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) { rcu_read_unlock(); goto nla_put_failure; } } rcu_read_unlock(); nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; err = tcf_action_dump_old(skb, a, bind, ref); if (err > 0) { nla_nest_end(skb, nest); return err; } nla_put_failure: nlmsg_trim(skb, b); return -1; } EXPORT_SYMBOL(tcf_action_dump_1); int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind, int ref) { struct tc_action *a; int err = -EINVAL, i; struct nlattr *nest; for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { a = actions[i]; nest = nla_nest_start(skb, i + 1); if (nest == NULL) goto nla_put_failure; err = tcf_action_dump_1(skb, a, bind, ref); if (err < 0) goto errout; nla_nest_end(skb, nest); } return 0; nla_put_failure: err = -EINVAL; errout: nla_nest_cancel(skb, nest); return err; } static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) { struct tc_cookie *c = kzalloc(sizeof(*c), GFP_KERNEL); if (!c) return NULL; c->data = nla_memdup(tb[TCA_ACT_COOKIE], GFP_KERNEL); if (!c->data) { kfree(c); return NULL; } c->len = nla_len(tb[TCA_ACT_COOKIE]); return c; } static bool tcf_action_valid(int action) { int opcode = TC_ACT_EXT_OPCODE(action); if (!opcode) return action <= TC_ACT_VALUE_MAX; return opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC; } struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, bool rtnl_held, struct netlink_ext_ack *extack) { struct tc_action *a; struct tc_action_ops *a_o; struct tc_cookie *cookie = NULL; char act_name[IFNAMSIZ]; struct nlattr *tb[TCA_ACT_MAX + 1]; struct nlattr *kind; int err; if (name == NULL) { err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack); if (err < 0) goto err_out; err = -EINVAL; kind = tb[TCA_ACT_KIND]; if (!kind) { NL_SET_ERR_MSG(extack, "TC action kind must be specified"); goto err_out; } if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) { NL_SET_ERR_MSG(extack, "TC action name too long"); goto err_out; } if (tb[TCA_ACT_COOKIE]) { int cklen = nla_len(tb[TCA_ACT_COOKIE]); if (cklen > TC_COOKIE_MAX_SIZE) { NL_SET_ERR_MSG(extack, "TC cookie size above the maximum"); goto err_out; } cookie = nla_memdup_cookie(tb); if (!cookie) { NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); err = -ENOMEM; goto err_out; } } } else { if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) { NL_SET_ERR_MSG(extack, "TC action name too long"); err = -EINVAL; goto err_out; } } a_o = tc_lookup_action_n(act_name); if (a_o == NULL) { #ifdef CONFIG_MODULES if (rtnl_held) rtnl_unlock(); request_module("act_%s", act_name); if (rtnl_held) rtnl_lock(); a_o = tc_lookup_action_n(act_name); 
/* We dropped the RTNL semaphore in order to * perform the module load. So, even if we * succeeded in loading the module we have to * tell the caller to replay the request. We * indicate this using -EAGAIN. */ if (a_o != NULL) { err = -EAGAIN; goto err_mod; } #endif NL_SET_ERR_MSG(extack, "Failed to load TC action module"); err = -ENOENT; goto err_out; } /* backward compatibility for policer */ if (name == NULL) err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, rtnl_held, extack); else err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held, extack); if (err < 0) goto err_mod; if (!name && tb[TCA_ACT_COOKIE]) tcf_set_action_cookie(&a->act_cookie, cookie); /* module count goes up only when brand new policy is created * if it exists and is only bound to in a_o->init() then * ACT_P_CREATED is not returned (a zero is). */ if (err != ACT_P_CREATED) module_put(a_o->owner); if (TC_ACT_EXT_CMP(a->tcfa_action, TC_ACT_GOTO_CHAIN)) { err = tcf_action_goto_chain_init(a, tp); if (err) { tcf_action_destroy_1(a, bind); NL_SET_ERR_MSG(extack, "Failed to init TC action chain"); return ERR_PTR(err); } } if (!tcf_action_valid(a->tcfa_action)) { tcf_action_destroy_1(a, bind); NL_SET_ERR_MSG(extack, "Invalid control action value"); return ERR_PTR(-EINVAL); } if (!bind && ovr && err == ACT_P_CREATED) refcount_set(&a->tcfa_refcnt, 2); return a; err_mod: module_put(a_o->owner); err_out: if (cookie) { kfree(cookie->data); kfree(cookie); } return ERR_PTR(err); } /* Returns numbers of initialized actions or negative error. */ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind, struct tc_action *actions[], size_t *attr_size, bool rtnl_held, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; size_t sz = 0; int err; int i; err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); if (err < 0) return err; for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind, rtnl_held, extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; } act->order = i; sz += tcf_action_fill_size(act); /* Start from index 0 */ actions[i - 1] = act; } *attr_size = tcf_action_full_attrs_size(sz); return i - 1; err: tcf_action_destroy(actions, bind); return err; } int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, int compat_mode) { int err = 0; struct gnet_dump d; if (p == NULL) goto errout; /* compat_mode being true specifies a call that is supposed * to add additional backward compatibility statistic TLVs. 
*/ if (compat_mode) { if (p->type == TCA_OLD_COMPAT) err = gnet_stats_start_copy_compat(skb, 0, TCA_STATS, TCA_XSTATS, &p->tcfa_lock, &d, TCA_PAD); else return 0; } else err = gnet_stats_start_copy(skb, TCA_ACT_STATS, &p->tcfa_lock, &d, TCA_ACT_PAD); if (err < 0) goto errout; if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, &p->tcfa_qstats, p->tcfa_qstats.qlen) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) goto errout; return 0; errout: return -1; } static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], u32 portid, u32 seq, u16 flags, int event, int bind, int ref) { struct tcamsg *t; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*t), flags); if (!nlh) goto out_nlmsg_trim; t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; nest = nla_nest_start(skb, TCA_ACT_TAB); if (!nest) goto out_nlmsg_trim; if (tcf_action_dump(skb, actions, bind, ref) < 0) goto out_nlmsg_trim; nla_nest_end(skb, nest); nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; out_nlmsg_trim: nlmsg_trim(skb, b); return -1; } static int tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, struct tc_action *actions[], int event, struct netlink_ext_ack *extack) { struct sk_buff *skb; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, 0, 1) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; } return rtnl_unicast(skb, net, portid); } static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_ACT_MAX + 1]; const struct tc_action_ops *ops; struct tc_action *a; int index; int err; err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack); if (err < 0) goto err_out; err = -EINVAL; if (tb[TCA_ACT_INDEX] == NULL || nla_len(tb[TCA_ACT_INDEX]) < sizeof(index)) { NL_SET_ERR_MSG(extack, "Invalid TC action index value"); goto err_out; } index = nla_get_u32(tb[TCA_ACT_INDEX]); err = -EINVAL; ops = tc_lookup_action(tb[TCA_ACT_KIND]); if (!ops) { /* could happen in batch of actions */ NL_SET_ERR_MSG(extack, "Specified TC action not found"); goto err_out; } err = -ENOENT; if (ops->lookup(net, &a, index, extack) == 0) goto err_mod; module_put(ops->owner); return a; err_mod: module_put(ops->owner); err_out: return ERR_PTR(err); } static int tca_action_flush(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, struct netlink_ext_ack *extack) { struct sk_buff *skb; unsigned char *b; struct nlmsghdr *nlh; struct tcamsg *t; struct netlink_callback dcb; struct nlattr *nest; struct nlattr *tb[TCA_ACT_MAX + 1]; const struct tc_action_ops *ops; struct nlattr *kind; int err = -ENOMEM; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return err; b = skb_tail_pointer(skb); err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack); if (err < 0) goto err_out; err = -EINVAL; kind = tb[TCA_ACT_KIND]; ops = tc_lookup_action(kind); if (!ops) { /*some idjot trying to flush unknown action */ NL_SET_ERR_MSG(extack, "Cannot flush unknown TC action"); goto err_out; } nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t), 0); if (!nlh) { NL_SET_ERR_MSG(extack, "Failed to create TC action 
flush notification"); goto out_module_put; } t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; nest = nla_nest_start(skb, TCA_ACT_TAB); if (!nest) { NL_SET_ERR_MSG(extack, "Failed to add new netlink message"); goto out_module_put; } err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops, extack); if (err <= 0) { nla_nest_cancel(skb, nest); goto out_module_put; } nla_nest_end(skb, nest); nlh->nlmsg_len = skb_tail_pointer(skb) - b; nlh->nlmsg_flags |= NLM_F_ROOT; module_put(ops->owner); err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (err > 0) return 0; if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send TC action flush notification"); return err; out_module_put: module_put(ops->owner); err_out: kfree_skb(skb); return err; } static int tcf_action_delete(struct net *net, struct tc_action *actions[]) { int i; for (i = 0; i < TCA_ACT_MAX_PRIO && actions[i]; i++) { struct tc_action *a = actions[i]; const struct tc_action_ops *ops = a->ops; /* Actions can be deleted concurrently so we must save their * type and id to search again after reference is released. */ struct tcf_idrinfo *idrinfo = a->idrinfo; u32 act_index = a->tcfa_index; actions[i] = NULL; if (tcf_action_put(a)) { /* last reference, action was deleted concurrently */ module_put(ops->owner); } else { int ret; /* now do the delete */ ret = tcf_idr_delete_index(idrinfo, act_index); if (ret < 0) return ret; } } return 0; } static int tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { int ret; struct sk_buff *skb; skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size, GFP_KERNEL); if (!skb) return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, 0, 2) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes"); kfree_skb(skb); return -EINVAL; } /* now do the delete */ ret = tcf_action_delete(net, actions); if (ret < 0) { NL_SET_ERR_MSG(extack, "Failed to delete TC action"); kfree_skb(skb); return ret; } ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (ret > 0) return 0; return ret; } static int tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, int event, struct netlink_ext_ack *extack) { int i, ret; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *act; size_t attr_size = 0; struct tc_action *actions[TCA_ACT_MAX_PRIO] = {}; ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack); if (ret < 0) return ret; if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) { if (tb[1]) return tca_action_flush(net, tb[1], n, portid, extack); NL_SET_ERR_MSG(extack, "Invalid netlink attributes while flushing TC action"); return -EINVAL; } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { act = tcf_action_get_1(net, tb[i], n, portid, extack); if (IS_ERR(act)) { ret = PTR_ERR(act); goto err; } attr_size += tcf_action_fill_size(act); actions[i - 1] = act; } attr_size = tcf_action_full_attrs_size(attr_size); if (event == RTM_GETACTION) ret = tcf_get_notify(net, portid, n, actions, event, extack); else { /* delete */ ret = tcf_del_notify(net, n, actions, portid, attr_size, extack); if (ret) goto err; return 0; } err: tcf_action_put_many(actions); return ret; } static int tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; int err = 0; 
skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size, GFP_KERNEL); if (!skb) return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_NEWACTION, 0, 0) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; } err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (err > 0) err = 0; return err; } static int tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 portid, int ovr, struct netlink_ext_ack *extack) { size_t attr_size = 0; int loop, ret; struct tc_action *actions[TCA_ACT_MAX_PRIO] = {}; for (loop = 0; loop < 10; loop++) { ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, actions, &attr_size, true, extack); if (ret != -EAGAIN) break; } if (ret < 0) return ret; ret = tcf_add_notify(net, n, actions, portid, attr_size, extack); if (ovr) tcf_action_put_many(actions); return ret; } static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = { [TCA_ROOT_FLAGS] = { .type = NLA_BITFIELD32, .validation_data = &tcaa_root_flags_allowed }, [TCA_ROOT_TIME_DELTA] = { .type = NLA_U32 }, }; static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_ROOT_MAX + 1]; u32 portid = skb ? NETLINK_CB(skb).portid : 0; int ret = 0, ovr = 0; if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ROOT_MAX, NULL, extack); if (ret < 0) return ret; if (tca[TCA_ACT_TAB] == NULL) { NL_SET_ERR_MSG(extack, "Netlink action attributes missing"); return -EINVAL; } /* n->nlmsg_flags & NLM_F_CREATE */ switch (n->nlmsg_type) { case RTM_NEWACTION: /* we are going to assume all other flags * imply create only if it doesn't exist * Note that CREATE | EXCL implies that * but since we want avoid ambiguity (eg when flags * is zero) then just set this */ if (n->nlmsg_flags & NLM_F_REPLACE) ovr = 1; ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr, extack); break; case RTM_DELACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, portid, RTM_DELACTION, extack); break; case RTM_GETACTION: ret = tca_action_gd(net, tca[TCA_ACT_TAB], n, portid, RTM_GETACTION, extack); break; default: BUG(); } return ret; } static struct nlattr *find_dump_kind(struct nlattr **nla) { struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1]; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct nlattr *kind; tb1 = nla[TCA_ACT_TAB]; if (tb1 == NULL) return NULL; if (nla_parse(tb, TCA_ACT_MAX_PRIO, nla_data(tb1), NLMSG_ALIGN(nla_len(tb1)), NULL, NULL) < 0) return NULL; if (tb[1] == NULL) return NULL; if (nla_parse_nested(tb2, TCA_ACT_MAX, tb[1], NULL, NULL) < 0) return NULL; kind = tb2[TCA_ACT_KIND]; return kind; } static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; struct tc_action_ops *a_o; int ret = 0; struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh); struct nlattr *tb[TCA_ROOT_MAX + 1]; struct nlattr *count_attr = NULL; unsigned long jiffy_since = 0; struct nlattr *kind = NULL; struct nla_bitfield32 bf; u32 msecs_since = 0; u32 act_count = 0; ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX, tcaa_policy, NULL); if (ret < 0) return ret; kind = find_dump_kind(tb); if 
(kind == NULL) { pr_info("tc_dump_action: action bad kind\n"); return 0; } a_o = tc_lookup_action(kind); if (a_o == NULL) return 0; cb->args[2] = 0; if (tb[TCA_ROOT_FLAGS]) { bf = nla_get_bitfield32(tb[TCA_ROOT_FLAGS]); cb->args[2] = bf.value; } if (tb[TCA_ROOT_TIME_DELTA]) { msecs_since = nla_get_u32(tb[TCA_ROOT_TIME_DELTA]); } nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*t), 0); if (!nlh) goto out_module_put; if (msecs_since) jiffy_since = jiffies - msecs_to_jiffies(msecs_since); t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; cb->args[3] = jiffy_since; count_attr = nla_reserve(skb, TCA_ROOT_COUNT, sizeof(u32)); if (!count_attr) goto out_module_put; nest = nla_nest_start(skb, TCA_ACT_TAB); if (nest == NULL) goto out_module_put; ret = a_o->walk(net, skb, cb, RTM_GETACTION, a_o, NULL); if (ret < 0) goto out_module_put; if (ret > 0) { nla_nest_end(skb, nest); ret = skb->len; act_count = cb->args[1]; memcpy(nla_data(count_attr), &act_count, sizeof(u32)); cb->args[1] = 0; } else nlmsg_trim(skb, b); nlh->nlmsg_len = skb_tail_pointer(skb) - b; if (NETLINK_CB(cb->skb).portid && ret) nlh->nlmsg_flags |= NLM_F_MULTI; module_put(a_o->owner); return skb->len; out_module_put: module_put(a_o->owner); nlmsg_trim(skb, b); return skb->len; } struct tcf_action_net { struct rhashtable egdev_ht; }; static unsigned int tcf_action_net_id; struct tcf_action_egdev_cb { struct list_head list; tc_setup_cb_t *cb; void *cb_priv; }; struct tcf_action_egdev { struct rhash_head ht_node; const struct net_device *dev; unsigned int refcnt; struct list_head cb_list; }; static const struct rhashtable_params tcf_action_egdev_ht_params = { .key_offset = offsetof(struct tcf_action_egdev, dev), .head_offset = offsetof(struct tcf_action_egdev, ht_node), .key_len = sizeof(const struct net_device *), }; static struct tcf_action_egdev * tcf_action_egdev_lookup(const struct net_device *dev) { struct net *net = dev_net(dev); struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); return rhashtable_lookup_fast(&tan->egdev_ht, &dev, tcf_action_egdev_ht_params); } static struct tcf_action_egdev * tcf_action_egdev_get(const struct net_device *dev) { struct tcf_action_egdev *egdev; struct tcf_action_net *tan; egdev = tcf_action_egdev_lookup(dev); if (egdev) goto inc_ref; egdev = kzalloc(sizeof(*egdev), GFP_KERNEL); if (!egdev) return NULL; INIT_LIST_HEAD(&egdev->cb_list); egdev->dev = dev; tan = net_generic(dev_net(dev), tcf_action_net_id); rhashtable_insert_fast(&tan->egdev_ht, &egdev->ht_node, tcf_action_egdev_ht_params); inc_ref: egdev->refcnt++; return egdev; } static void tcf_action_egdev_put(struct tcf_action_egdev *egdev) { struct tcf_action_net *tan; if (--egdev->refcnt) return; tan = net_generic(dev_net(egdev->dev), tcf_action_net_id); rhashtable_remove_fast(&tan->egdev_ht, &egdev->ht_node, tcf_action_egdev_ht_params); kfree(egdev); } static struct tcf_action_egdev_cb * tcf_action_egdev_cb_lookup(struct tcf_action_egdev *egdev, tc_setup_cb_t *cb, void *cb_priv) { struct tcf_action_egdev_cb *egdev_cb; list_for_each_entry(egdev_cb, &egdev->cb_list, list) if (egdev_cb->cb == cb && egdev_cb->cb_priv == cb_priv) return egdev_cb; return NULL; } static int tcf_action_egdev_cb_call(struct tcf_action_egdev *egdev, enum tc_setup_type type, void *type_data, bool err_stop) { struct tcf_action_egdev_cb *egdev_cb; int ok_count = 0; int err; list_for_each_entry(egdev_cb, &egdev->cb_list, list) { err = egdev_cb->cb(type, type_data, 
egdev_cb->cb_priv); if (err) { if (err_stop) return err; } else { ok_count++; } } return ok_count; } static int tcf_action_egdev_cb_add(struct tcf_action_egdev *egdev, tc_setup_cb_t *cb, void *cb_priv) { struct tcf_action_egdev_cb *egdev_cb; egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv); if (WARN_ON(egdev_cb)) return -EEXIST; egdev_cb = kzalloc(sizeof(*egdev_cb), GFP_KERNEL); if (!egdev_cb) return -ENOMEM; egdev_cb->cb = cb; egdev_cb->cb_priv = cb_priv; list_add(&egdev_cb->list, &egdev->cb_list); return 0; } static void tcf_action_egdev_cb_del(struct tcf_action_egdev *egdev, tc_setup_cb_t *cb, void *cb_priv) { struct tcf_action_egdev_cb *egdev_cb; egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv); if (WARN_ON(!egdev_cb)) return; list_del(&egdev_cb->list); kfree(egdev_cb); } static int __tc_setup_cb_egdev_register(const struct net_device *dev, tc_setup_cb_t *cb, void *cb_priv) { struct tcf_action_egdev *egdev = tcf_action_egdev_get(dev); int err; if (!egdev) return -ENOMEM; err = tcf_action_egdev_cb_add(egdev, cb, cb_priv); if (err) goto err_cb_add; return 0; err_cb_add: tcf_action_egdev_put(egdev); return err; } int tc_setup_cb_egdev_register(const struct net_device *dev, tc_setup_cb_t *cb, void *cb_priv) { int err; rtnl_lock(); err = __tc_setup_cb_egdev_register(dev, cb, cb_priv); rtnl_unlock(); return err; } EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_register); static void __tc_setup_cb_egdev_unregister(const struct net_device *dev, tc_setup_cb_t *cb, void *cb_priv) { struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev); if (WARN_ON(!egdev)) return; tcf_action_egdev_cb_del(egdev, cb, cb_priv); tcf_action_egdev_put(egdev); } void tc_setup_cb_egdev_unregister(const struct net_device *dev, tc_setup_cb_t *cb, void *cb_priv) { rtnl_lock(); __tc_setup_cb_egdev_unregister(dev, cb, cb_priv); rtnl_unlock(); } EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_unregister); int tc_setup_cb_egdev_call(const struct net_device *dev, enum tc_setup_type type, void *type_data, bool err_stop) { struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev); if (!egdev) return 0; return tcf_action_egdev_cb_call(egdev, type, type_data, err_stop); } EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_call); static __net_init int tcf_action_net_init(struct net *net) { struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); return rhashtable_init(&tan->egdev_ht, &tcf_action_egdev_ht_params); } static void __net_exit tcf_action_net_exit(struct net *net) { struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); rhashtable_destroy(&tan->egdev_ht); } static struct pernet_operations tcf_action_net_ops = { .init = tcf_action_net_init, .exit = tcf_action_net_exit, .id = &tcf_action_net_id, .size = sizeof(struct tcf_action_net), }; static int __init tc_action_init(void) { int err; err = register_pernet_subsys(&tcf_action_net_ops); if (err) return err; rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action, 0); return 0; } subsys_initcall(tc_action_init); |
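/*
 * Illustrative, hypothetical skeleton of an action module ("act_foo") built
 * on the API above; it is not part of act_api.c and every foo_/tcf_foo_ name
 * is an assumption. Only the walk and lookup callbacks are filled in, since
 * they just delegate to tcf_generic_walker() and tcf_idr_search() as
 * implemented above; the act/dump/init bodies and the per-netns table setup
 * are elided. tcf_register_action() above refuses ops with a missing
 * act/dump/init/walk/lookup callback and registers the pernet ops first.
 */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/pkt_cls.h>
#include <net/netns/generic.h>
#include <net/act_api.h>

static unsigned int foo_net_id;

static int tcf_foo_act(struct sk_buff *skb, const struct tc_action *a,
		       struct tcf_result *res)
{
	return TC_ACT_OK;	/* real per-packet work elided */
}

static int tcf_foo_dump(struct sk_buff *skb, struct tc_action *a,
			int bind, int ref)
{
	return -EINVAL;		/* netlink dump of the parameters elided */
}

static int tcf_foo_init(struct net *net, struct nlattr *nla,
			struct nlattr *est, struct tc_action **a,
			int ovr, int bind, bool rtnl_held,
			struct netlink_ext_ack *extack)
{
	/* Real modules parse @nla and go through tcf_idr_check_alloc(),
	 * tcf_idr_create() and tcf_idr_insert(); elided in this sketch.
	 */
	return -EOPNOTSUPP;
}

static int tcf_foo_walker(struct net *net, struct sk_buff *skb,
			  struct netlink_callback *cb, int type,
			  const struct tc_action_ops *ops,
			  struct netlink_ext_ack *extack)
{
	struct tc_action_net *tn = net_generic(net, foo_net_id);

	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}

static int tcf_foo_search(struct net *net, struct tc_action **a, u32 index,
			  struct netlink_ext_ack *extack)
{
	struct tc_action_net *tn = net_generic(net, foo_net_id);

	return tcf_idr_search(tn, a, index);
}

static struct tc_action_ops act_foo_ops = {
	.kind	= "foo",
	.owner	= THIS_MODULE,
	.act	= tcf_foo_act,
	.dump	= tcf_foo_dump,
	.init	= tcf_foo_init,
	.walk	= tcf_foo_walker,
	.lookup	= tcf_foo_search,
};

static __net_init int foo_init_net(struct net *net)
{
	/* Real modules set up the per-netns tc_action_net/idrinfo that the
	 * tcf_idr_*() helpers operate on; elided here.
	 */
	return 0;
}

static void __net_exit foo_exit_net(struct net *net)
{
	/* Tear down the per-netns action table; elided here. */
}

static struct pernet_operations foo_net_ops = {
	.init = foo_init_net,
	.exit = foo_exit_net,
	.id   = &foo_net_id,
	.size = sizeof(struct tc_action_net),
};

static int __init foo_init_module(void)
{
	return tcf_register_action(&act_foo_ops, &foo_net_ops);
}

static void __exit foo_cleanup_module(void)
{
	tcf_unregister_action(&act_foo_ops, &foo_net_ops);
}

module_init(foo_init_module);
module_exit(foo_cleanup_module);
MODULE_LICENSE("GPL");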
908 909 910 | /** * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 1997-2003 Erez Zadok * Copyright (C) 2001-2003 Stony Brook University * Copyright (C) 2004-2007 International Business Machines Corp. * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> * Michael C. Thompson <mcthomps@us.ibm.com> * Tyler Hicks <tyhicks@ou.edu> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 02111-1307, USA. */ #include <linux/dcache.h> #include <linux/file.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/skbuff.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/key.h> #include <linux/parser.h> #include <linux/fs_stack.h> #include <linux/slab.h> #include <linux/magic.h> #include "ecryptfs_kernel.h" /** * Module parameter that defines the ecryptfs_verbosity level. */ int ecryptfs_verbosity = 0; module_param(ecryptfs_verbosity, int, 0); MODULE_PARM_DESC(ecryptfs_verbosity, "Initial verbosity level (0 or 1; defaults to " "0, which is Quiet)"); /** * Module parameter that defines the number of message buffer elements */ unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS; module_param(ecryptfs_message_buf_len, uint, 0); MODULE_PARM_DESC(ecryptfs_message_buf_len, "Number of message buffer elements"); /** * Module parameter that defines the maximum guaranteed amount of time to wait * for a response from ecryptfsd. The actual sleep time will be, more than * likely, a small amount greater than this specified value, but only less if * the message successfully arrives. */ signed long ecryptfs_message_wait_timeout = ECRYPTFS_MAX_MSG_CTX_TTL / HZ; module_param(ecryptfs_message_wait_timeout, long, 0); MODULE_PARM_DESC(ecryptfs_message_wait_timeout, "Maximum number of seconds that an operation will " "sleep while waiting for a message response from " "userspace"); /** * Module parameter that is an estimate of the maximum number of users * that will be concurrently using eCryptfs. Set this to the right * value to balance performance and memory use. */ unsigned int ecryptfs_number_of_users = ECRYPTFS_DEFAULT_NUM_USERS; module_param(ecryptfs_number_of_users, uint, 0); MODULE_PARM_DESC(ecryptfs_number_of_users, "An estimate of the number of " "concurrent users of eCryptfs"); void __ecryptfs_printk(const char *fmt, ...) { va_list args; va_start(args, fmt); if (fmt[1] == '7') { /* KERN_DEBUG */ if (ecryptfs_verbosity >= 1) vprintk(fmt, args); } else vprintk(fmt, args); va_end(args); } /** * ecryptfs_init_lower_file * @ecryptfs_dentry: Fully initialized eCryptfs dentry object, with * the lower dentry and the lower mount set * * eCryptfs only ever keeps a single open file for every lower * inode. All I/O operations to the lower inode occur through that * file. 
When the first eCryptfs dentry that interposes with the first * lower dentry for that inode is created, this function creates the * lower file struct and associates it with the eCryptfs * inode. When all eCryptfs files associated with the inode are released, the * file is closed. * * The lower file will be opened with read/write permissions, if * possible. Otherwise, it is opened read-only. * * This function does nothing if a lower file is already * associated with the eCryptfs inode. * * Returns zero on success; non-zero otherwise */ static int ecryptfs_init_lower_file(struct dentry *dentry, struct file **lower_file) { const struct cred *cred = current_cred(); struct path *path = ecryptfs_dentry_to_lower_path(dentry); int rc; rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt, cred); if (rc) { printk(KERN_ERR "Error opening lower file " "for lower_dentry [0x%p] and lower_mnt [0x%p]; " "rc = [%d]\n", path->dentry, path->mnt, rc); (*lower_file) = NULL; } return rc; } int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode) { struct ecryptfs_inode_info *inode_info; int count, rc = 0; inode_info = ecryptfs_inode_to_private(inode); mutex_lock(&inode_info->lower_file_mutex); count = atomic_inc_return(&inode_info->lower_file_count); if (WARN_ON_ONCE(count < 1)) rc = -EINVAL; else if (count == 1) { rc = ecryptfs_init_lower_file(dentry, &inode_info->lower_file); if (rc) atomic_set(&inode_info->lower_file_count, 0); } mutex_unlock(&inode_info->lower_file_mutex); return rc; } void ecryptfs_put_lower_file(struct inode *inode) { struct ecryptfs_inode_info *inode_info; inode_info = ecryptfs_inode_to_private(inode); if (atomic_dec_and_mutex_lock(&inode_info->lower_file_count, &inode_info->lower_file_mutex)) { filemap_write_and_wait(inode->i_mapping); fput(inode_info->lower_file); inode_info->lower_file = NULL; mutex_unlock(&inode_info->lower_file_mutex); } } enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, ecryptfs_opt_ecryptfs_key_bytes, ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, ecryptfs_opt_check_dev_ruid, ecryptfs_opt_err }; static const match_table_t tokens = { {ecryptfs_opt_sig, "sig=%s"}, {ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"}, {ecryptfs_opt_cipher, "cipher=%s"}, {ecryptfs_opt_ecryptfs_cipher, "ecryptfs_cipher=%s"}, {ecryptfs_opt_ecryptfs_key_bytes, "ecryptfs_key_bytes=%u"}, {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, {ecryptfs_opt_check_dev_ruid, "ecryptfs_check_dev_ruid"}, {ecryptfs_opt_err, NULL} }; static int ecryptfs_init_global_auth_toks( struct ecryptfs_mount_crypt_stat *mount_crypt_stat) { struct ecryptfs_global_auth_tok *global_auth_tok; struct ecryptfs_auth_tok *auth_tok; int rc = 0; list_for_each_entry(global_auth_tok, &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { rc = ecryptfs_keyring_auth_tok_for_sig( &global_auth_tok->global_auth_tok_key, &auth_tok, global_auth_tok->sig); if (rc) { 
printk(KERN_ERR "Could not find valid key in user " "session keyring for sig specified in mount " "option: [%s]\n", global_auth_tok->sig); global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID; goto out; } else { global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID; up_write(&(global_auth_tok->global_auth_tok_key)->sem); } } out: return rc; } static void ecryptfs_init_mount_crypt_stat( struct ecryptfs_mount_crypt_stat *mount_crypt_stat) { memset((void *)mount_crypt_stat, 0, sizeof(struct ecryptfs_mount_crypt_stat)); INIT_LIST_HEAD(&mount_crypt_stat->global_auth_tok_list); mutex_init(&mount_crypt_stat->global_auth_tok_list_mutex); mount_crypt_stat->flags |= ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED; } /** * ecryptfs_parse_options * @sb: The ecryptfs super block * @options: The options passed to the kernel * @check_ruid: set to 1 if device uid should be checked against the ruid * * Parse mount options: * debug=N - ecryptfs_verbosity level for debug output * sig=XXX - description(signature) of the key to use * * Returns the dentry object of the lower-level (lower/interposed) * directory; We want to mount our stackable file system on top of * that lower directory. * * The signature of the key to use must be the description of a key * already in the keyring. Mounting will fail if the key can not be * found. * * Returns zero on success; non-zero on error */ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, uid_t *check_ruid) { char *p; int rc = 0; int sig_set = 0; int cipher_name_set = 0; int fn_cipher_name_set = 0; int cipher_key_bytes; int cipher_key_bytes_set = 0; int fn_cipher_key_bytes; int fn_cipher_key_bytes_set = 0; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = &sbi->mount_crypt_stat; substring_t args[MAX_OPT_ARGS]; int token; char *sig_src; char *cipher_name_dst; char *cipher_name_src; char *fn_cipher_name_dst; char *fn_cipher_name_src; char *fnek_dst; char *fnek_src; char *cipher_key_bytes_src; char *fn_cipher_key_bytes_src; u8 cipher_code; *check_ruid = 0; if (!options) { rc = -EINVAL; goto out; } ecryptfs_init_mount_crypt_stat(mount_crypt_stat); while ((p = strsep(&options, ",")) != NULL) { if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case ecryptfs_opt_sig: case ecryptfs_opt_ecryptfs_sig: sig_src = args[0].from; rc = ecryptfs_add_global_auth_tok(mount_crypt_stat, sig_src, 0); if (rc) { printk(KERN_ERR "Error attempting to register " "global sig; rc = [%d]\n", rc); goto out; } sig_set = 1; break; case ecryptfs_opt_cipher: case ecryptfs_opt_ecryptfs_cipher: cipher_name_src = args[0].from; cipher_name_dst = mount_crypt_stat-> global_default_cipher_name; strncpy(cipher_name_dst, cipher_name_src, ECRYPTFS_MAX_CIPHER_NAME_SIZE); cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; cipher_name_set = 1; break; case ecryptfs_opt_ecryptfs_key_bytes: cipher_key_bytes_src = args[0].from; cipher_key_bytes = (int)simple_strtol(cipher_key_bytes_src, &cipher_key_bytes_src, 0); mount_crypt_stat->global_default_cipher_key_size = cipher_key_bytes; cipher_key_bytes_set = 1; break; case ecryptfs_opt_passthrough: mount_crypt_stat->flags |= ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED; break; case ecryptfs_opt_xattr_metadata: mount_crypt_stat->flags |= ECRYPTFS_XATTR_METADATA_ENABLED; break; case ecryptfs_opt_encrypted_view: mount_crypt_stat->flags |= ECRYPTFS_XATTR_METADATA_ENABLED; mount_crypt_stat->flags |= ECRYPTFS_ENCRYPTED_VIEW_ENABLED; break; case ecryptfs_opt_fnek_sig: fnek_src = args[0].from; fnek_dst = 
mount_crypt_stat->global_default_fnek_sig; strncpy(fnek_dst, fnek_src, ECRYPTFS_SIG_SIZE_HEX); mount_crypt_stat->global_default_fnek_sig[ ECRYPTFS_SIG_SIZE_HEX] = '\0'; rc = ecryptfs_add_global_auth_tok( mount_crypt_stat, mount_crypt_stat->global_default_fnek_sig, ECRYPTFS_AUTH_TOK_FNEK); if (rc) { printk(KERN_ERR "Error attempting to register " "global fnek sig [%s]; rc = [%d]\n", mount_crypt_stat->global_default_fnek_sig, rc); goto out; } mount_crypt_stat->flags |= (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK); break; case ecryptfs_opt_fn_cipher: fn_cipher_name_src = args[0].from; fn_cipher_name_dst = mount_crypt_stat->global_default_fn_cipher_name; strncpy(fn_cipher_name_dst, fn_cipher_name_src, ECRYPTFS_MAX_CIPHER_NAME_SIZE); mount_crypt_stat->global_default_fn_cipher_name[ ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; fn_cipher_name_set = 1; break; case ecryptfs_opt_fn_cipher_key_bytes: fn_cipher_key_bytes_src = args[0].from; fn_cipher_key_bytes = (int)simple_strtol(fn_cipher_key_bytes_src, &fn_cipher_key_bytes_src, 0); mount_crypt_stat->global_default_fn_cipher_key_bytes = fn_cipher_key_bytes; fn_cipher_key_bytes_set = 1; break; case ecryptfs_opt_unlink_sigs: mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS; break; case ecryptfs_opt_mount_auth_tok_only: mount_crypt_stat->flags |= ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; break; case ecryptfs_opt_check_dev_ruid: *check_ruid = 1; break; case ecryptfs_opt_err: default: printk(KERN_WARNING "%s: eCryptfs: unrecognized option [%s]\n", __func__, p); } } if (!sig_set) { rc = -EINVAL; ecryptfs_printk(KERN_ERR, "You must supply at least one valid " "auth tok signature as a mount " "parameter; see the eCryptfs README\n"); goto out; } if (!cipher_name_set) { int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); BUG_ON(cipher_name_len > ECRYPTFS_MAX_CIPHER_NAME_SIZE); strcpy(mount_crypt_stat->global_default_cipher_name, ECRYPTFS_DEFAULT_CIPHER); } if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) && !fn_cipher_name_set) strcpy(mount_crypt_stat->global_default_fn_cipher_name, mount_crypt_stat->global_default_cipher_name); if (!cipher_key_bytes_set) mount_crypt_stat->global_default_cipher_key_size = 0; if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) && !fn_cipher_key_bytes_set) mount_crypt_stat->global_default_fn_cipher_key_bytes = mount_crypt_stat->global_default_cipher_key_size; cipher_code = ecryptfs_code_for_cipher_string( mount_crypt_stat->global_default_cipher_name, mount_crypt_stat->global_default_cipher_key_size); if (!cipher_code) { ecryptfs_printk(KERN_ERR, "eCryptfs doesn't support cipher: %s\n", mount_crypt_stat->global_default_cipher_name); rc = -EINVAL; goto out; } mutex_lock(&key_tfm_list_mutex); if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, NULL)) { rc = ecryptfs_add_new_key_tfm( NULL, mount_crypt_stat->global_default_cipher_name, mount_crypt_stat->global_default_cipher_key_size); if (rc) { printk(KERN_ERR "Error attempting to initialize " "cipher with name = [%s] and key size = [%td]; " "rc = [%d]\n", mount_crypt_stat->global_default_cipher_name, mount_crypt_stat->global_default_cipher_key_size, rc); rc = -EINVAL; mutex_unlock(&key_tfm_list_mutex); goto out; } } if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) && !ecryptfs_tfm_exists( mount_crypt_stat->global_default_fn_cipher_name, NULL)) { rc = ecryptfs_add_new_key_tfm( NULL, mount_crypt_stat->global_default_fn_cipher_name, mount_crypt_stat->global_default_fn_cipher_key_bytes); if (rc) { 
printk(KERN_ERR "Error attempting to initialize " "cipher with name = [%s] and key size = [%td]; " "rc = [%d]\n", mount_crypt_stat->global_default_fn_cipher_name, mount_crypt_stat->global_default_fn_cipher_key_bytes, rc); rc = -EINVAL; mutex_unlock(&key_tfm_list_mutex); goto out; } } mutex_unlock(&key_tfm_list_mutex); rc = ecryptfs_init_global_auth_toks(mount_crypt_stat); if (rc) printk(KERN_WARNING "One or more global auth toks could not " "properly register; rc = [%d]\n", rc); out: return rc; } struct kmem_cache *ecryptfs_sb_info_cache; static struct file_system_type ecryptfs_fs_type; /** * ecryptfs_get_sb * @fs_type * @flags * @dev_name: The path to mount over * @raw_data: The options passed into the kernel */ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data) { struct super_block *s; struct ecryptfs_sb_info *sbi; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct ecryptfs_dentry_info *root_info; const char *err = "Getting sb failed"; struct inode *inode; struct path path; uid_t check_ruid; int rc; sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); if (!sbi) { rc = -ENOMEM; goto out; } if (!dev_name) { rc = -EINVAL; err = "Device name cannot be null"; goto out; } rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid); if (rc) { err = "Error parsing options"; goto out; } mount_crypt_stat = &sbi->mount_crypt_stat; s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) { rc = PTR_ERR(s); goto out; } rc = super_setup_bdi(s); if (rc) goto out1; ecryptfs_set_superblock_private(s, sbi); /* ->kill_sb() will take care of sbi after that point */ sbi = NULL; s->s_op = &ecryptfs_sops; s->s_xattr = ecryptfs_xattr_handlers; s->s_d_op = &ecryptfs_dops; err = "Reading sb failed"; rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); if (rc) { ecryptfs_printk(KERN_WARNING, "kern_path() failed\n"); goto out1; } if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) { rc = -EINVAL; printk(KERN_ERR "Mount on filesystem of type " "eCryptfs explicitly disallowed due to " "known incompatibilities\n"); goto out_free; } if (check_ruid && !uid_eq(d_inode(path.dentry)->i_uid, current_uid())) { rc = -EPERM; printk(KERN_ERR "Mount of device (uid: %d) not owned by " "requested user (uid: %d)\n", i_uid_read(d_inode(path.dentry)), from_kuid(&init_user_ns, current_uid())); goto out_free; } ecryptfs_set_superblock_lower(s, path.dentry->d_sb); /** * Set the POSIX ACL flag based on whether they're enabled in the lower * mount. 
*/ s->s_flags = flags & ~SB_POSIXACL; s->s_flags |= path.dentry->d_sb->s_flags & SB_POSIXACL; /** * Force a read-only eCryptfs mount when: * 1) The lower mount is ro * 2) The ecryptfs_encrypted_view mount option is specified */ if (sb_rdonly(path.dentry->d_sb) || mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) s->s_flags |= SB_RDONLY; s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; s->s_magic = ECRYPTFS_SUPER_MAGIC; s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1; rc = -EINVAL; if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { pr_err("eCryptfs: maximum fs stacking depth exceeded\n"); goto out_free; } inode = ecryptfs_get_inode(d_inode(path.dentry), s); rc = PTR_ERR(inode); if (IS_ERR(inode)) goto out_free; s->s_root = d_make_root(inode); if (!s->s_root) { rc = -ENOMEM; goto out_free; } rc = -ENOMEM; root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); if (!root_info) goto out_free; /* ->kill_sb() will take care of root_info */ ecryptfs_set_dentry_private(s->s_root, root_info); root_info->lower_path = path; s->s_flags |= SB_ACTIVE; return dget(s->s_root); out_free: path_put(&path); out1: deactivate_locked_super(s); out: if (sbi) { ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat); kmem_cache_free(ecryptfs_sb_info_cache, sbi); } printk(KERN_ERR "%s; rc = [%d]\n", err, rc); return ERR_PTR(rc); } /** * ecryptfs_kill_block_super * @sb: The ecryptfs super block * * Used to bring the superblock down and free the private data. */ static void ecryptfs_kill_block_super(struct super_block *sb) { struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); kill_anon_super(sb); if (!sb_info) return; ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); kmem_cache_free(ecryptfs_sb_info_cache, sb_info); } static struct file_system_type ecryptfs_fs_type = { .owner = THIS_MODULE, .name = "ecryptfs", .mount = ecryptfs_mount, .kill_sb = ecryptfs_kill_block_super, .fs_flags = 0 }; MODULE_ALIAS_FS("ecryptfs"); /** * inode_info_init_once * * Initializes the ecryptfs_inode_info_cache when it is created */ static void inode_info_init_once(void *vptr) { struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr; inode_init_once(&ei->vfs_inode); } static struct ecryptfs_cache_info { struct kmem_cache **cache; const char *name; size_t size; slab_flags_t flags; void (*ctor)(void *obj); } ecryptfs_cache_infos[] = { { .cache = &ecryptfs_auth_tok_list_item_cache, .name = "ecryptfs_auth_tok_list_item", .size = sizeof(struct ecryptfs_auth_tok_list_item), }, { .cache = &ecryptfs_file_info_cache, .name = "ecryptfs_file_cache", .size = sizeof(struct ecryptfs_file_info), }, { .cache = &ecryptfs_dentry_info_cache, .name = "ecryptfs_dentry_info_cache", .size = sizeof(struct ecryptfs_dentry_info), }, { .cache = &ecryptfs_inode_info_cache, .name = "ecryptfs_inode_cache", .size = sizeof(struct ecryptfs_inode_info), .flags = SLAB_ACCOUNT, .ctor = inode_info_init_once, }, { .cache = &ecryptfs_sb_info_cache, .name = "ecryptfs_sb_cache", .size = sizeof(struct ecryptfs_sb_info), }, { .cache = &ecryptfs_header_cache, .name = "ecryptfs_headers", .size = PAGE_SIZE, }, { .cache = &ecryptfs_xattr_cache, .name = "ecryptfs_xattr_cache", .size = PAGE_SIZE, }, { .cache = &ecryptfs_key_record_cache, .name = "ecryptfs_key_record_cache", .size = sizeof(struct ecryptfs_key_record), }, { .cache = &ecryptfs_key_sig_cache, .name = "ecryptfs_key_sig_cache", .size = sizeof(struct ecryptfs_key_sig), }, { .cache = 
&ecryptfs_global_auth_tok_cache, .name = "ecryptfs_global_auth_tok_cache", .size = sizeof(struct ecryptfs_global_auth_tok), }, { .cache = &ecryptfs_key_tfm_cache, .name = "ecryptfs_key_tfm_cache", .size = sizeof(struct ecryptfs_key_tfm), }, }; static void ecryptfs_free_kmem_caches(void) { int i; /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) { struct ecryptfs_cache_info *info; info = &ecryptfs_cache_infos[i]; kmem_cache_destroy(*(info->cache)); } } /** * ecryptfs_init_kmem_caches * * Returns zero on success; non-zero otherwise */ static int ecryptfs_init_kmem_caches(void) { int i; for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) { struct ecryptfs_cache_info *info; info = &ecryptfs_cache_infos[i]; *(info->cache) = kmem_cache_create(info->name, info->size, 0, SLAB_HWCACHE_ALIGN | info->flags, info->ctor); if (!*(info->cache)) { ecryptfs_free_kmem_caches(); ecryptfs_printk(KERN_WARNING, "%s: " "kmem_cache_create failed\n", info->name); return -ENOMEM; } } return 0; } static struct kobject *ecryptfs_kobj; static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, char *buff) { return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK); } static struct kobj_attribute version_attr = __ATTR_RO(version); static struct attribute *attributes[] = { &version_attr.attr, NULL, }; static const struct attribute_group attr_group = { .attrs = attributes, }; static int do_sysfs_registration(void) { int rc; ecryptfs_kobj = kobject_create_and_add("ecryptfs", fs_kobj); if (!ecryptfs_kobj) { printk(KERN_ERR "Unable to create ecryptfs kset\n"); rc = -ENOMEM; goto out; } rc = sysfs_create_group(ecryptfs_kobj, &attr_group); if (rc) { printk(KERN_ERR "Unable to create ecryptfs version attributes\n"); kobject_put(ecryptfs_kobj); } out: return rc; } static void do_sysfs_unregistration(void) { sysfs_remove_group(ecryptfs_kobj, &attr_group); kobject_put(ecryptfs_kobj); } static int __init ecryptfs_init(void) { int rc; if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_SIZE) { rc = -EINVAL; ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is " "larger than the host's page size, and so " "eCryptfs cannot run on this system. The " "default eCryptfs extent size is [%u] bytes; " "the page size is [%lu] bytes.\n", ECRYPTFS_DEFAULT_EXTENT_SIZE, (unsigned long)PAGE_SIZE); goto out; } rc = ecryptfs_init_kmem_caches(); if (rc) { printk(KERN_ERR "Failed to allocate one or more kmem_cache objects\n"); goto out; } rc = do_sysfs_registration(); if (rc) { printk(KERN_ERR "sysfs registration failed\n"); goto out_free_kmem_caches; } rc = ecryptfs_init_kthread(); if (rc) { printk(KERN_ERR "%s: kthread initialization failed; " "rc = [%d]\n", __func__, rc); goto out_do_sysfs_unregistration; } rc = ecryptfs_init_messaging(); if (rc) { printk(KERN_ERR "Failure occurred while attempting to " "initialize the communications channel to " "ecryptfsd\n"); goto out_destroy_kthread; } rc = ecryptfs_init_crypto(); if (rc) { printk(KERN_ERR "Failure whilst attempting to init crypto; " "rc = [%d]\n", rc); goto out_release_messaging; } rc = register_filesystem(&ecryptfs_fs_type); if (rc) { printk(KERN_ERR "Failed to register filesystem\n"); goto out_destroy_crypto; } if (ecryptfs_verbosity > 0) printk(KERN_CRIT "eCryptfs verbosity set to %d. 
Secret values " "will be written to the syslog!\n", ecryptfs_verbosity); goto out; out_destroy_crypto: ecryptfs_destroy_crypto(); out_release_messaging: ecryptfs_release_messaging(); out_destroy_kthread: ecryptfs_destroy_kthread(); out_do_sysfs_unregistration: do_sysfs_unregistration(); out_free_kmem_caches: ecryptfs_free_kmem_caches(); out: return rc; } static void __exit ecryptfs_exit(void) { int rc; rc = ecryptfs_destroy_crypto(); if (rc) printk(KERN_ERR "Failure whilst attempting to destroy crypto; " "rc = [%d]\n", rc); ecryptfs_release_messaging(); ecryptfs_destroy_kthread(); do_sysfs_unregistration(); unregister_filesystem(&ecryptfs_fs_type); ecryptfs_free_kmem_caches(); } MODULE_AUTHOR("Michael A. Halcrow <mhalcrow@us.ibm.com>"); MODULE_DESCRIPTION("eCryptfs"); MODULE_LICENSE("GPL"); module_init(ecryptfs_init) module_exit(ecryptfs_exit) |
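/*
 * Editor's note (not part of the eCryptfs sources above): a minimal,
 * hedged sketch of the option-splitting pattern that
 * ecryptfs_parse_options() uses -- strsep() over a comma-separated
 * "name=value" string, skipping empty fields and dispatching per
 * option.  This is plain userspace C so it can actually be compiled
 * and run; match_token(), substring_t and the tokens[] table are
 * kernel-only helpers and are deliberately not reproduced.  The
 * function and option names below are illustrative only.
 */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>

static void handle_option(const char *name, const char *value)
{
	/* The kernel code switches on a token table here instead. */
	printf("option '%s' = '%s'\n", name, value ? value : "(flag)");
}

int main(void)
{
	/* Same shape as the raw_data string handed to the mount code above. */
	char options[] = "sig=0123456789abcdef,cipher=aes,ecryptfs_passthrough";
	char *rest = options;
	char *p;

	while ((p = strsep(&rest, ",")) != NULL) {
		char *eq;

		if (!*p)	/* skip empty fields, as the kernel code does */
			continue;
		eq = strchr(p, '=');
		if (eq)
			*eq++ = '\0';	/* split "name=value" in place */
		handle_option(p, eq);
	}
	return 0;
}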
/* Definitions for key type implementations * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public Licence * as published by the Free Software Foundation; either version * 2 of the Licence, or (at your option) any later version. */ #ifndef _LINUX_KEY_TYPE_H #define _LINUX_KEY_TYPE_H #include <linux/key.h> #include <linux/errno.h> #ifdef CONFIG_KEYS /* * Pre-parsed payload, used by key add, update and instantiate. * * This struct will be cleared and data and datalen will be set with the data * and length parameters from the caller and quotalen will be set from * def_datalen from the key type. Then if the preparse() op is provided by the * key type, that will be called. Then the struct will be passed to the * instantiate() or the update() op. * * If the preparse() op is given, the free_preparse() op will be called to * clear the contents. */ struct key_preparsed_payload { char *description; /* Proposed key description (or NULL) */ union key_payload payload; /* Proposed payload */ const void *data; /* Raw data */ size_t datalen; /* Raw datalen */ size_t quotalen; /* Quota length for proposed payload */ time64_t expiry; /* Expiry time of key */ } __randomize_layout; typedef int (*request_key_actor_t)(struct key *auth_key, void *aux); /* * Preparsed matching criterion. */ struct key_match_data { /* Comparison function, defaults to exact description match, but can be * overridden by type->match_preparse(). Should return true if a match * is found and false if not. */ bool (*cmp)(const struct key *key, const struct key_match_data *match_data); const void *raw_data; /* Raw match data */ void *preparsed; /* For ->match_preparse() to stash stuff */ unsigned lookup_type; /* Type of lookup for this search. */ #define KEYRING_SEARCH_LOOKUP_DIRECT 0x0000 /* Direct lookup by description. */ #define KEYRING_SEARCH_LOOKUP_ITERATE 0x0001 /* Iterative search. */ }; /* * kernel managed key type definition */ struct key_type { /* name of the type */ const char *name; /* default payload length for quota precalculation (optional) * - this can be used instead of calling key_payload_reserve(), that * function only needs to be called if the real datalen is different */ size_t def_datalen; /* vet a description */ int (*vet_description)(const char *description); /* Preparse the data blob from userspace that is to be the payload, * generating a proposed description and payload that will be handed to * the instantiate() and update() ops. */ int (*preparse)(struct key_preparsed_payload *prep); /* Free a preparse data structure.
*/ void (*free_preparse)(struct key_preparsed_payload *prep); /* instantiate a key of this type * - this method should call key_payload_reserve() to determine if the * user's quota will hold the payload */ int (*instantiate)(struct key *key, struct key_preparsed_payload *prep); /* update a key of this type (optional) * - this method should call key_payload_reserve() to recalculate the * quota consumption * - the key must be locked against read when modifying */ int (*update)(struct key *key, struct key_preparsed_payload *prep); /* Preparse the data supplied to ->match() (optional). The * data to be preparsed can be found in match_data->raw_data. * The lookup type can also be set by this function. */ int (*match_preparse)(struct key_match_data *match_data); /* Free preparsed match data (optional). This should be supplied if * ->match_preparse() is supplied. */ void (*match_free)(struct key_match_data *match_data); /* clear some of the data from a key on revocation (optional) * - the key's semaphore will be write-locked by the caller */ void (*revoke)(struct key *key); /* clear the data from a key (optional) */ void (*destroy)(struct key *key); /* describe a key */ void (*describe)(const struct key *key, struct seq_file *p); /* read a key's data (optional) * - permission checks will be done by the caller * - the key's semaphore will be readlocked by the caller * - should return the amount of data that could be read, no matter how * much is copied into the buffer * - shouldn't do the copy if the buffer is NULL */ long (*read)(const struct key *key, char *buffer, size_t buflen); /* handle request_key() for this type instead of invoking * /sbin/request-key (optional) * - key is the key to instantiate * - authkey is the authority to assume when instantiating this key * - op is the operation to be done, usually "create" * - the call must not return until the instantiation process has run * its course */ request_key_actor_t request_key; /* Look up a keyring access restriction (optional) * * - NULL is a valid return value (meaning the requested restriction * is known but will never block addition of a key) * - should return -EINVAL if the restriction is unknown */ struct key_restriction *(*lookup_restriction)(const char *params); /* internal fields */ struct list_head link; /* link in types list */ struct lock_class_key lock_class; /* key->sem lock class */ } __randomize_layout; extern struct key_type key_type_keyring; extern int register_key_type(struct key_type *ktype); extern void unregister_key_type(struct key_type *ktype); extern int key_payload_reserve(struct key *key, size_t datalen); extern int key_instantiate_and_link(struct key *key, const void *data, size_t datalen, struct key *keyring, struct key *authkey); extern int key_reject_and_link(struct key *key, unsigned timeout, unsigned error, struct key *keyring, struct key *authkey); extern void complete_request_key(struct key *authkey, int error); static inline int key_negate_and_link(struct key *key, unsigned timeout, struct key *keyring, struct key *authkey) { return key_reject_and_link(key, timeout, ENOKEY, keyring, authkey); } extern int generic_key_instantiate(struct key *key, struct key_preparsed_payload *prep); #endif /* CONFIG_KEYS */ #endif /* _LINUX_KEY_TYPE_H */ |
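/*
 * Editor's note (not part of the header above): a hedged sketch of how
 * a key type might be defined and registered against this interface.
 * Only operations and helpers declared above are used
 * (key_payload_reserve(), register_key_type(), unregister_key_type());
 * the "example" type name and the module boilerplate are hypothetical,
 * and a real key type would typically also provide preparse(),
 * free_preparse() and destroy() ops for its payload.
 */
#include <linux/key-type.h>
#include <linux/seq_file.h>
#include <linux/module.h>

static int example_instantiate(struct key *key,
			       struct key_preparsed_payload *prep)
{
	/* Charge the caller-supplied payload length against the key quota. */
	return key_payload_reserve(key, prep->datalen);
}

static void example_describe(const struct key *key, struct seq_file *m)
{
	seq_puts(m, key->description);
}

static struct key_type key_type_example = {
	.name		= "example",
	.instantiate	= example_instantiate,
	.describe	= example_describe,
};

static int __init example_key_init(void)
{
	return register_key_type(&key_type_example);
}

static void __exit example_key_exit(void)
{
	unregister_key_type(&key_type_example);
}

module_init(example_key_init);
module_exit(example_key_exit);
MODULE_LICENSE("GPL");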
/* * kernel/stop_machine.c * * Copyright (C) 2008, 2005 IBM Corporation. * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au * Copyright (C) 2010 SUSE Linux Products GmbH * Copyright (C) 2010 Tejun Heo <tj@kernel.org> * * This file is released under the GPLv2 and any later version. */ #include <linux/completion.h> #include <linux/cpu.h> #include <linux/init.h> #include <linux/kthread.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/sched.h> #include <linux/stop_machine.h> #include <linux/interrupt.h> #include <linux/kallsyms.h> #include <linux/smpboot.h> #include <linux/atomic.h> #include <linux/nmi.h> #include <linux/sched/wake_q.h> /* * Structure to determine completion condition and record errors. May * be shared by works on different cpus.
*/ struct cpu_stop_done { atomic_t nr_todo; /* nr left to execute */ int ret; /* collected return value */ struct completion completion; /* fired if nr_todo reaches 0 */ }; /* the actual stopper, one per every possible cpu, enabled on online cpus */ struct cpu_stopper { struct task_struct *thread; raw_spinlock_t lock; bool enabled; /* is this stopper enabled? */ struct list_head works; /* list of pending works */ struct cpu_stop_work stop_work; /* for stop_cpus */ }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); static bool stop_machine_initialized = false; /* static data for stop_cpus */ static DEFINE_MUTEX(stop_cpus_mutex); static bool stop_cpus_in_progress; static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) { memset(done, 0, sizeof(*done)); atomic_set(&done->nr_todo, nr_todo); init_completion(&done->completion); } /* signal completion unless @done is NULL */ static void cpu_stop_signal_done(struct cpu_stop_done *done) { if (atomic_dec_and_test(&done->nr_todo)) complete(&done->completion); } static void __cpu_stop_queue_work(struct cpu_stopper *stopper, struct cpu_stop_work *work, struct wake_q_head *wakeq) { list_add_tail(&work->list, &stopper->works); wake_q_add(wakeq, stopper->thread); } /* queue @work to @stopper. if offline, @work is completed immediately */ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); DEFINE_WAKE_Q(wakeq); unsigned long flags; bool enabled; preempt_disable(); raw_spin_lock_irqsave(&stopper->lock, flags); enabled = stopper->enabled; if (enabled) __cpu_stop_queue_work(stopper, work, &wakeq); else if (work->done) cpu_stop_signal_done(work->done); raw_spin_unlock_irqrestore(&stopper->lock, flags); wake_up_q(&wakeq); preempt_enable(); return enabled; } /** * stop_one_cpu - stop a cpu * @cpu: cpu to stop * @fn: function to execute * @arg: argument to @fn * * Execute @fn(@arg) on @cpu. @fn is run in a process context with * the highest priority preempting any task on the cpu and * monopolizing it. This function returns after the execution is * complete. * * This function doesn't guarantee @cpu stays online till @fn * completes. If @cpu goes down in the middle, execution may happen * partially or fully on different cpus. @fn should either be ready * for that or the caller should ensure that @cpu stays online until * this function completes. * * CONTEXT: * Might sleep. * * RETURNS: * -ENOENT if @fn(@arg) was not executed because @cpu was offline; * otherwise, the return value of @fn. */ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) { struct cpu_stop_done done; struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; cpu_stop_init_done(&done, 1); if (!cpu_stop_queue_work(cpu, &work)) return -ENOENT; /* * In case @cpu == smp_proccessor_id() we can avoid a sleep+wakeup * cycle by doing a preemption: */ cond_resched(); wait_for_completion(&done.completion); return done.ret; } /* This controls the threads on each CPU. */ enum multi_stop_state { /* Dummy starting state for thread. */ MULTI_STOP_NONE, /* Awaiting everyone to be scheduled. */ MULTI_STOP_PREPARE, /* Disable interrupts. */ MULTI_STOP_DISABLE_IRQ, /* Run the function */ MULTI_STOP_RUN, /* Exit */ MULTI_STOP_EXIT, }; struct multi_stop_data { cpu_stop_fn_t fn; void *data; /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. 
*/ unsigned int num_threads; const struct cpumask *active_cpus; enum multi_stop_state state; atomic_t thread_ack; }; static void set_state(struct multi_stop_data *msdata, enum multi_stop_state newstate) { /* Reset ack counter. */ atomic_set(&msdata->thread_ack, msdata->num_threads); smp_wmb(); msdata->state = newstate; } /* Last one to ack a state moves to the next state. */ static void ack_state(struct multi_stop_data *msdata) { if (atomic_dec_and_test(&msdata->thread_ack)) set_state(msdata, msdata->state + 1); } /* This is the cpu_stop function which stops the CPU. */ static int multi_cpu_stop(void *data) { struct multi_stop_data *msdata = data; enum multi_stop_state curstate = MULTI_STOP_NONE; int cpu = smp_processor_id(), err = 0; unsigned long flags; bool is_active; /* * When called from stop_machine_from_inactive_cpu(), irq might * already be disabled. Save the state and restore it on exit. */ local_save_flags(flags); if (!msdata->active_cpus) is_active = cpu == cpumask_first(cpu_online_mask); else is_active = cpumask_test_cpu(cpu, msdata->active_cpus); /* Simple state machine */ do { /* Chill out and ensure we re-read multi_stop_state. */ cpu_relax_yield(); if (msdata->state != curstate) { curstate = msdata->state; switch (curstate) { case MULTI_STOP_DISABLE_IRQ: local_irq_disable(); hard_irq_disable(); break; case MULTI_STOP_RUN: if (is_active) err = msdata->fn(msdata->data); break; default: break; } ack_state(msdata); } else if (curstate > MULTI_STOP_PREPARE) { /* * At this stage all other CPUs we depend on must spin * in the same loop. Any reason for hard-lockup should * be detected and reported on their side. */ touch_nmi_watchdog(); } } while (curstate != MULTI_STOP_EXIT); local_irq_restore(flags); return err; } static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, int cpu2, struct cpu_stop_work *work2) { struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1); struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); DEFINE_WAKE_Q(wakeq); int err; retry: /* * The waking up of stopper threads has to happen in the same * scheduling context as the queueing. Otherwise, there is a * possibility of one of the above stoppers being woken up by another * CPU, and preempting us. This will cause us to not wake up the other * stopper forever. */ preempt_disable(); raw_spin_lock_irq(&stopper1->lock); raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); if (!stopper1->enabled || !stopper2->enabled) { err = -ENOENT; goto unlock; } /* * Ensure that if we race with __stop_cpus() the stoppers won't get * queued up in reverse order leading to system deadlock. * * We can't miss stop_cpus_in_progress if queue_stop_cpus_work() has * queued a work on cpu1 but not on cpu2, we hold both locks. * * It can be falsely true but it is safe to spin until it is cleared, * queue_stop_cpus_work() does everything under preempt_disable(). */ if (unlikely(stop_cpus_in_progress)) { err = -EDEADLK; goto unlock; } err = 0; __cpu_stop_queue_work(stopper1, work1, &wakeq); __cpu_stop_queue_work(stopper2, work2, &wakeq); unlock: raw_spin_unlock(&stopper2->lock); raw_spin_unlock_irq(&stopper1->lock); if (unlikely(err == -EDEADLK)) { preempt_enable(); while (stop_cpus_in_progress) cpu_relax(); goto retry; } wake_up_q(&wakeq); preempt_enable(); return err; } /** * stop_two_cpus - stops two cpus * @cpu1: the cpu to stop * @cpu2: the other cpu to stop * @fn: function to execute * @arg: argument to @fn * * Stops both the current and specified CPU and runs @fn on one of them. 
* * returns when both are completed. */ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg) { struct cpu_stop_done done; struct cpu_stop_work work1, work2; struct multi_stop_data msdata; msdata = (struct multi_stop_data){ .fn = fn, .data = arg, .num_threads = 2, .active_cpus = cpumask_of(cpu1), }; work1 = work2 = (struct cpu_stop_work){ .fn = multi_cpu_stop, .arg = &msdata, .done = &done }; cpu_stop_init_done(&done, 2); set_state(&msdata, MULTI_STOP_PREPARE); if (cpu1 > cpu2) swap(cpu1, cpu2); if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) return -ENOENT; wait_for_completion(&done.completion); return done.ret; } /** * stop_one_cpu_nowait - stop a cpu but don't wait for completion * @cpu: cpu to stop * @fn: function to execute * @arg: argument to @fn * @work_buf: pointer to cpu_stop_work structure * * Similar to stop_one_cpu() but doesn't wait for completion. The * caller is responsible for ensuring @work_buf is currently unused * and will remain untouched until stopper starts executing @fn. * * CONTEXT: * Don't care. * * RETURNS: * true if cpu_stop_work was queued successfully and @fn will be called, * false otherwise. */ bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf) { *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; return cpu_stop_queue_work(cpu, work_buf); } static bool queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, struct cpu_stop_done *done) { struct cpu_stop_work *work; unsigned int cpu; bool queued = false; /* * Disable preemption while queueing to avoid getting * preempted by a stopper which might wait for other stoppers * to enter @fn which can lead to deadlock. */ preempt_disable(); stop_cpus_in_progress = true; for_each_cpu(cpu, cpumask) { work = &per_cpu(cpu_stopper.stop_work, cpu); work->fn = fn; work->arg = arg; work->done = done; if (cpu_stop_queue_work(cpu, work)) queued = true; } stop_cpus_in_progress = false; preempt_enable(); return queued; } static int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) { struct cpu_stop_done done; cpu_stop_init_done(&done, cpumask_weight(cpumask)); if (!queue_stop_cpus_work(cpumask, fn, arg, &done)) return -ENOENT; wait_for_completion(&done.completion); return done.ret; } /** * stop_cpus - stop multiple cpus * @cpumask: cpus to stop * @fn: function to execute * @arg: argument to @fn * * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu, * @fn is run in a process context with the highest priority * preempting any task on the cpu and monopolizing it. This function * returns after all executions are complete. * * This function doesn't guarantee the cpus in @cpumask stay online * till @fn completes. If some cpus go down in the middle, execution * on the cpu may happen partially or fully on different cpus. @fn * should either be ready for that or the caller should ensure that * the cpus stay online until this function completes. * * All stop_cpus() calls are serialized making it safe for @fn to wait * for all cpus to start executing it. * * CONTEXT: * Might sleep. * * RETURNS: * -ENOENT if @fn(@arg) was not executed at all because all cpus in * @cpumask were offline; otherwise, 0 if all executions of @fn * returned 0, any non zero return value if any returned non zero. 
*/ int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) { int ret; /* static works are used, process one request at a time */ mutex_lock(&stop_cpus_mutex); ret = __stop_cpus(cpumask, fn, arg); mutex_unlock(&stop_cpus_mutex); return ret; } /** * try_stop_cpus - try to stop multiple cpus * @cpumask: cpus to stop * @fn: function to execute * @arg: argument to @fn * * Identical to stop_cpus() except that it fails with -EAGAIN if * someone else is already using the facility. * * CONTEXT: * Might sleep. * * RETURNS: * -EAGAIN if someone else is already stopping cpus, -ENOENT if * @fn(@arg) was not executed at all because all cpus in @cpumask were * offline; otherwise, 0 if all executions of @fn returned 0, any non * zero return value if any returned non zero. */ int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) { int ret; /* static works are used, process one request at a time */ if (!mutex_trylock(&stop_cpus_mutex)) return -EAGAIN; ret = __stop_cpus(cpumask, fn, arg); mutex_unlock(&stop_cpus_mutex); return ret; } static int cpu_stop_should_run(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); unsigned long flags; int run; raw_spin_lock_irqsave(&stopper->lock, flags); run = !list_empty(&stopper->works); raw_spin_unlock_irqrestore(&stopper->lock, flags); return run; } static void cpu_stopper_thread(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); struct cpu_stop_work *work; repeat: work = NULL; raw_spin_lock_irq(&stopper->lock); if (!list_empty(&stopper->works)) { work = list_first_entry(&stopper->works, struct cpu_stop_work, list); list_del_init(&work->list); } raw_spin_unlock_irq(&stopper->lock); if (work) { cpu_stop_fn_t fn = work->fn; void *arg = work->arg; struct cpu_stop_done *done = work->done; int ret; /* cpu stop callbacks must not sleep, make in_atomic() == T */ preempt_count_inc(); ret = fn(arg); if (done) { if (ret) done->ret = ret; cpu_stop_signal_done(done); } preempt_count_dec(); WARN_ONCE(preempt_count(), "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); goto repeat; } } void stop_machine_park(int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); /* * Lockless. cpu_stopper_thread() will take stopper->lock and flush * the pending works before it parks, until then it is fine to queue * the new works. 
*/ stopper->enabled = false; kthread_park(stopper->thread); } extern void sched_set_stop_task(int cpu, struct task_struct *stop); static void cpu_stop_create(unsigned int cpu) { sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu)); } static void cpu_stop_park(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); WARN_ON(!list_empty(&stopper->works)); } void stop_machine_unpark(int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); stopper->enabled = true; kthread_unpark(stopper->thread); } static struct smp_hotplug_thread cpu_stop_threads = { .store = &cpu_stopper.thread, .thread_should_run = cpu_stop_should_run, .thread_fn = cpu_stopper_thread, .thread_comm = "migration/%u", .create = cpu_stop_create, .park = cpu_stop_park, .selfparking = true, }; static int __init cpu_stop_init(void) { unsigned int cpu; for_each_possible_cpu(cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); raw_spin_lock_init(&stopper->lock); INIT_LIST_HEAD(&stopper->works); } BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads)); stop_machine_unpark(raw_smp_processor_id()); stop_machine_initialized = true; return 0; } early_initcall(cpu_stop_init); int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { .fn = fn, .data = data, .num_threads = num_online_cpus(), .active_cpus = cpus, }; lockdep_assert_cpus_held(); if (!stop_machine_initialized) { /* * Handle the case where stop_machine() is called * early in boot before stop_machine() has been * initialized. */ unsigned long flags; int ret; WARN_ON_ONCE(msdata.num_threads != 1); local_irq_save(flags); hard_irq_disable(); ret = (*fn)(data); local_irq_restore(flags); return ret; } /* Set the initial state and stop all online cpus. */ set_state(&msdata, MULTI_STOP_PREPARE); return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata); } int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { int ret; /* No CPUs can come up or down during this. */ cpus_read_lock(); ret = stop_machine_cpuslocked(fn, data, cpus); cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(stop_machine); /** * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU * @fn: the function to run * @data: the data ptr for the @fn() * @cpus: the cpus to run the @fn() on (NULL = any online cpu) * * This is identical to stop_machine() but can be called from a CPU which * is not active. The local CPU is in the process of hotplug (so no other * CPU hotplug can start) and not marked active and doesn't have enough * context to sleep. * * This function provides stop_machine() functionality for such state by * using busy-wait for synchronization and executing @fn directly for local * CPU. * * CONTEXT: * Local CPU is inactive. Temporarily stops all active CPUs. * * RETURNS: * 0 if all executions of @fn returned 0, any non zero return value if any * returned non zero. */ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { .fn = fn, .data = data, .active_cpus = cpus }; struct cpu_stop_done done; int ret; /* Local CPU must be inactive and CPU hotplug in progress. */ BUG_ON(cpu_active(raw_smp_processor_id())); msdata.num_threads = num_active_cpus() + 1; /* +1 for local */ /* No proper task established and can't sleep - busy wait for lock. 
*/ while (!mutex_trylock(&stop_cpus_mutex)) cpu_relax(); /* Schedule work on other CPUs and execute directly for local CPU */ set_state(&msdata, MULTI_STOP_PREPARE); cpu_stop_init_done(&done, num_active_cpus()); queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata, &done); ret = multi_cpu_stop(&msdata); /* Busy wait for completion. */ while (!completion_done(&done.completion)) cpu_relax(); mutex_unlock(&stop_cpus_mutex); return ret ?: done.ret; } |
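/*
 * Editor's note (not part of the file above): a hedged sketch of a
 * stop_machine() caller.  The patching scenario and the names
 * do_patch()/apply_patch()/struct patch_args are hypothetical; the
 * point is the calling convention only.  The callback has the
 * cpu_stop_fn_t signature, and with @cpus == NULL it runs on the first
 * online CPU while every other online CPU spins in multi_cpu_stop()
 * with interrupts disabled, so it must not sleep.
 */
#include <linux/stop_machine.h>
#include <linux/printk.h>

struct patch_args {		/* hypothetical payload for the callback */
	void *addr;
	unsigned long insn;
};

static int do_patch(void *data)
{
	struct patch_args *args = data;

	/* Runs with the rest of the machine quiesced and IRQs disabled. */
	pr_info("patching %p with %lx\n", args->addr, args->insn);
	return 0;
}

static int __maybe_unused apply_patch(struct patch_args *args)
{
	/* Returns 0, the callback's non-zero return value, or -ENOENT. */
	return stop_machine(do_patch, args, NULL);
}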
/* * Cryptographic API. * * Whirlpool hashing Algorithm * * The Whirlpool algorithm was developed by Paulo S. L. M. Barreto and * Vincent Rijmen. It has been selected as one of cryptographic * primitives by the NESSIE project http://www.cryptonessie.org/ * * The original authors have disclaimed all copyright interest in this * code and thus put it in the public domain. The subsequent authors * have put this under the GNU General Public License. * * By Aaron Grothe ajgrothe@yahoo.com, August 23, 2004 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * */ #include <crypto/internal/hash.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <asm/byteorder.h> #include <linux/types.h> #define WP512_DIGEST_SIZE 64 #define WP384_DIGEST_SIZE 48 #define WP256_DIGEST_SIZE 32 #define WP512_BLOCK_SIZE 64 #define WP512_LENGTHBYTES 32 #define WHIRLPOOL_ROUNDS 10 struct wp512_ctx { u8 bitLength[WP512_LENGTHBYTES]; u8 buffer[WP512_BLOCK_SIZE]; int bufferBits; int bufferPos; u64 hash[WP512_DIGEST_SIZE/8]; }; /* * Though Whirlpool is endianness-neutral, the encryption tables are listed * in BIG-ENDIAN format, which is adopted throughout this implementation * (but little-endian notation would be equally suitable if consistently * employed).
*/ static const u64 C0[256] = { 0x18186018c07830d8ULL, 0x23238c2305af4626ULL, 0xc6c63fc67ef991b8ULL, 0xe8e887e8136fcdfbULL, 0x878726874ca113cbULL, 0xb8b8dab8a9626d11ULL, 0x0101040108050209ULL, 0x4f4f214f426e9e0dULL, 0x3636d836adee6c9bULL, 0xa6a6a2a6590451ffULL, 0xd2d26fd2debdb90cULL, 0xf5f5f3f5fb06f70eULL, 0x7979f979ef80f296ULL, 0x6f6fa16f5fcede30ULL, 0x91917e91fcef3f6dULL, 0x52525552aa07a4f8ULL, 0x60609d6027fdc047ULL, 0xbcbccabc89766535ULL, 0x9b9b569baccd2b37ULL, 0x8e8e028e048c018aULL, 0xa3a3b6a371155bd2ULL, 0x0c0c300c603c186cULL, 0x7b7bf17bff8af684ULL, 0x3535d435b5e16a80ULL, 0x1d1d741de8693af5ULL, 0xe0e0a7e05347ddb3ULL, 0xd7d77bd7f6acb321ULL, 0xc2c22fc25eed999cULL, 0x2e2eb82e6d965c43ULL, 0x4b4b314b627a9629ULL, 0xfefedffea321e15dULL, 0x575741578216aed5ULL, 0x15155415a8412abdULL, 0x7777c1779fb6eee8ULL, 0x3737dc37a5eb6e92ULL, 0xe5e5b3e57b56d79eULL, 0x9f9f469f8cd92313ULL, 0xf0f0e7f0d317fd23ULL, 0x4a4a354a6a7f9420ULL, 0xdada4fda9e95a944ULL, 0x58587d58fa25b0a2ULL, 0xc9c903c906ca8fcfULL, 0x2929a429558d527cULL, 0x0a0a280a5022145aULL, 0xb1b1feb1e14f7f50ULL, 0xa0a0baa0691a5dc9ULL, 0x6b6bb16b7fdad614ULL, 0x85852e855cab17d9ULL, 0xbdbdcebd8173673cULL, 0x5d5d695dd234ba8fULL, 0x1010401080502090ULL, 0xf4f4f7f4f303f507ULL, 0xcbcb0bcb16c08bddULL, 0x3e3ef83eedc67cd3ULL, 0x0505140528110a2dULL, 0x676781671fe6ce78ULL, 0xe4e4b7e47353d597ULL, 0x27279c2725bb4e02ULL, 0x4141194132588273ULL, 0x8b8b168b2c9d0ba7ULL, 0xa7a7a6a7510153f6ULL, 0x7d7de97dcf94fab2ULL, 0x95956e95dcfb3749ULL, 0xd8d847d88e9fad56ULL, 0xfbfbcbfb8b30eb70ULL, 0xeeee9fee2371c1cdULL, 0x7c7ced7cc791f8bbULL, 0x6666856617e3cc71ULL, 0xdddd53dda68ea77bULL, 0x17175c17b84b2eafULL, 0x4747014702468e45ULL, 0x9e9e429e84dc211aULL, 0xcaca0fca1ec589d4ULL, 0x2d2db42d75995a58ULL, 0xbfbfc6bf9179632eULL, 0x07071c07381b0e3fULL, 0xadad8ead012347acULL, 0x5a5a755aea2fb4b0ULL, 0x838336836cb51befULL, 0x3333cc3385ff66b6ULL, 0x636391633ff2c65cULL, 0x02020802100a0412ULL, 0xaaaa92aa39384993ULL, 0x7171d971afa8e2deULL, 0xc8c807c80ecf8dc6ULL, 0x19196419c87d32d1ULL, 0x494939497270923bULL, 0xd9d943d9869aaf5fULL, 0xf2f2eff2c31df931ULL, 0xe3e3abe34b48dba8ULL, 0x5b5b715be22ab6b9ULL, 0x88881a8834920dbcULL, 0x9a9a529aa4c8293eULL, 0x262698262dbe4c0bULL, 0x3232c8328dfa64bfULL, 0xb0b0fab0e94a7d59ULL, 0xe9e983e91b6acff2ULL, 0x0f0f3c0f78331e77ULL, 0xd5d573d5e6a6b733ULL, 0x80803a8074ba1df4ULL, 0xbebec2be997c6127ULL, 0xcdcd13cd26de87ebULL, 0x3434d034bde46889ULL, 0x48483d487a759032ULL, 0xffffdbffab24e354ULL, 0x7a7af57af78ff48dULL, 0x90907a90f4ea3d64ULL, 0x5f5f615fc23ebe9dULL, 0x202080201da0403dULL, 0x6868bd6867d5d00fULL, 0x1a1a681ad07234caULL, 0xaeae82ae192c41b7ULL, 0xb4b4eab4c95e757dULL, 0x54544d549a19a8ceULL, 0x93937693ece53b7fULL, 0x222288220daa442fULL, 0x64648d6407e9c863ULL, 0xf1f1e3f1db12ff2aULL, 0x7373d173bfa2e6ccULL, 0x12124812905a2482ULL, 0x40401d403a5d807aULL, 0x0808200840281048ULL, 0xc3c32bc356e89b95ULL, 0xecec97ec337bc5dfULL, 0xdbdb4bdb9690ab4dULL, 0xa1a1bea1611f5fc0ULL, 0x8d8d0e8d1c830791ULL, 0x3d3df43df5c97ac8ULL, 0x97976697ccf1335bULL, 0x0000000000000000ULL, 0xcfcf1bcf36d483f9ULL, 0x2b2bac2b4587566eULL, 0x7676c57697b3ece1ULL, 0x8282328264b019e6ULL, 0xd6d67fd6fea9b128ULL, 0x1b1b6c1bd87736c3ULL, 0xb5b5eeb5c15b7774ULL, 0xafaf86af112943beULL, 0x6a6ab56a77dfd41dULL, 0x50505d50ba0da0eaULL, 0x45450945124c8a57ULL, 0xf3f3ebf3cb18fb38ULL, 0x3030c0309df060adULL, 0xefef9bef2b74c3c4ULL, 0x3f3ffc3fe5c37edaULL, 0x55554955921caac7ULL, 0xa2a2b2a2791059dbULL, 0xeaea8fea0365c9e9ULL, 0x656589650fecca6aULL, 0xbabad2bab9686903ULL, 0x2f2fbc2f65935e4aULL, 0xc0c027c04ee79d8eULL, 0xdede5fdebe81a160ULL, 
0x1c1c701ce06c38fcULL, 0xfdfdd3fdbb2ee746ULL, 0x4d4d294d52649a1fULL, 0x92927292e4e03976ULL, 0x7575c9758fbceafaULL, 0x06061806301e0c36ULL, 0x8a8a128a249809aeULL, 0xb2b2f2b2f940794bULL, 0xe6e6bfe66359d185ULL, 0x0e0e380e70361c7eULL, 0x1f1f7c1ff8633ee7ULL, 0x6262956237f7c455ULL, 0xd4d477d4eea3b53aULL, 0xa8a89aa829324d81ULL, 0x96966296c4f43152ULL, 0xf9f9c3f99b3aef62ULL, 0xc5c533c566f697a3ULL, 0x2525942535b14a10ULL, 0x59597959f220b2abULL, 0x84842a8454ae15d0ULL, 0x7272d572b7a7e4c5ULL, 0x3939e439d5dd72ecULL, 0x4c4c2d4c5a619816ULL, 0x5e5e655eca3bbc94ULL, 0x7878fd78e785f09fULL, 0x3838e038ddd870e5ULL, 0x8c8c0a8c14860598ULL, 0xd1d163d1c6b2bf17ULL, 0xa5a5aea5410b57e4ULL, 0xe2e2afe2434dd9a1ULL, 0x616199612ff8c24eULL, 0xb3b3f6b3f1457b42ULL, 0x2121842115a54234ULL, 0x9c9c4a9c94d62508ULL, 0x1e1e781ef0663ceeULL, 0x4343114322528661ULL, 0xc7c73bc776fc93b1ULL, 0xfcfcd7fcb32be54fULL, 0x0404100420140824ULL, 0x51515951b208a2e3ULL, 0x99995e99bcc72f25ULL, 0x6d6da96d4fc4da22ULL, 0x0d0d340d68391a65ULL, 0xfafacffa8335e979ULL, 0xdfdf5bdfb684a369ULL, 0x7e7ee57ed79bfca9ULL, 0x242490243db44819ULL, 0x3b3bec3bc5d776feULL, 0xabab96ab313d4b9aULL, 0xcece1fce3ed181f0ULL, 0x1111441188552299ULL, 0x8f8f068f0c890383ULL, 0x4e4e254e4a6b9c04ULL, 0xb7b7e6b7d1517366ULL, 0xebeb8beb0b60cbe0ULL, 0x3c3cf03cfdcc78c1ULL, 0x81813e817cbf1ffdULL, 0x94946a94d4fe3540ULL, 0xf7f7fbf7eb0cf31cULL, 0xb9b9deb9a1676f18ULL, 0x13134c13985f268bULL, 0x2c2cb02c7d9c5851ULL, 0xd3d36bd3d6b8bb05ULL, 0xe7e7bbe76b5cd38cULL, 0x6e6ea56e57cbdc39ULL, 0xc4c437c46ef395aaULL, 0x03030c03180f061bULL, 0x565645568a13acdcULL, 0x44440d441a49885eULL, 0x7f7fe17fdf9efea0ULL, 0xa9a99ea921374f88ULL, 0x2a2aa82a4d825467ULL, 0xbbbbd6bbb16d6b0aULL, 0xc1c123c146e29f87ULL, 0x53535153a202a6f1ULL, 0xdcdc57dcae8ba572ULL, 0x0b0b2c0b58271653ULL, 0x9d9d4e9d9cd32701ULL, 0x6c6cad6c47c1d82bULL, 0x3131c43195f562a4ULL, 0x7474cd7487b9e8f3ULL, 0xf6f6fff6e309f115ULL, 0x464605460a438c4cULL, 0xacac8aac092645a5ULL, 0x89891e893c970fb5ULL, 0x14145014a04428b4ULL, 0xe1e1a3e15b42dfbaULL, 0x16165816b04e2ca6ULL, 0x3a3ae83acdd274f7ULL, 0x6969b9696fd0d206ULL, 0x09092409482d1241ULL, 0x7070dd70a7ade0d7ULL, 0xb6b6e2b6d954716fULL, 0xd0d067d0ceb7bd1eULL, 0xeded93ed3b7ec7d6ULL, 0xcccc17cc2edb85e2ULL, 0x424215422a578468ULL, 0x98985a98b4c22d2cULL, 0xa4a4aaa4490e55edULL, 0x2828a0285d885075ULL, 0x5c5c6d5cda31b886ULL, 0xf8f8c7f8933fed6bULL, 0x8686228644a411c2ULL, }; static const u64 C1[256] = { 0xd818186018c07830ULL, 0x2623238c2305af46ULL, 0xb8c6c63fc67ef991ULL, 0xfbe8e887e8136fcdULL, 0xcb878726874ca113ULL, 0x11b8b8dab8a9626dULL, 0x0901010401080502ULL, 0x0d4f4f214f426e9eULL, 0x9b3636d836adee6cULL, 0xffa6a6a2a6590451ULL, 0x0cd2d26fd2debdb9ULL, 0x0ef5f5f3f5fb06f7ULL, 0x967979f979ef80f2ULL, 0x306f6fa16f5fcedeULL, 0x6d91917e91fcef3fULL, 0xf852525552aa07a4ULL, 0x4760609d6027fdc0ULL, 0x35bcbccabc897665ULL, 0x379b9b569baccd2bULL, 0x8a8e8e028e048c01ULL, 0xd2a3a3b6a371155bULL, 0x6c0c0c300c603c18ULL, 0x847b7bf17bff8af6ULL, 0x803535d435b5e16aULL, 0xf51d1d741de8693aULL, 0xb3e0e0a7e05347ddULL, 0x21d7d77bd7f6acb3ULL, 0x9cc2c22fc25eed99ULL, 0x432e2eb82e6d965cULL, 0x294b4b314b627a96ULL, 0x5dfefedffea321e1ULL, 0xd5575741578216aeULL, 0xbd15155415a8412aULL, 0xe87777c1779fb6eeULL, 0x923737dc37a5eb6eULL, 0x9ee5e5b3e57b56d7ULL, 0x139f9f469f8cd923ULL, 0x23f0f0e7f0d317fdULL, 0x204a4a354a6a7f94ULL, 0x44dada4fda9e95a9ULL, 0xa258587d58fa25b0ULL, 0xcfc9c903c906ca8fULL, 0x7c2929a429558d52ULL, 0x5a0a0a280a502214ULL, 0x50b1b1feb1e14f7fULL, 0xc9a0a0baa0691a5dULL, 0x146b6bb16b7fdad6ULL, 0xd985852e855cab17ULL, 0x3cbdbdcebd817367ULL, 0x8f5d5d695dd234baULL, 
0x9010104010805020ULL, 0x07f4f4f7f4f303f5ULL, 0xddcbcb0bcb16c08bULL, 0xd33e3ef83eedc67cULL, 0x2d0505140528110aULL, 0x78676781671fe6ceULL, 0x97e4e4b7e47353d5ULL, 0x0227279c2725bb4eULL, 0x7341411941325882ULL, 0xa78b8b168b2c9d0bULL, 0xf6a7a7a6a7510153ULL, 0xb27d7de97dcf94faULL, 0x4995956e95dcfb37ULL, 0x56d8d847d88e9fadULL, 0x70fbfbcbfb8b30ebULL, 0xcdeeee9fee2371c1ULL, 0xbb7c7ced7cc791f8ULL, 0x716666856617e3ccULL, 0x7bdddd53dda68ea7ULL, 0xaf17175c17b84b2eULL, 0x454747014702468eULL, 0x1a9e9e429e84dc21ULL, 0xd4caca0fca1ec589ULL, 0x582d2db42d75995aULL, 0x2ebfbfc6bf917963ULL, 0x3f07071c07381b0eULL, 0xacadad8ead012347ULL, 0xb05a5a755aea2fb4ULL, 0xef838336836cb51bULL, 0xb63333cc3385ff66ULL, 0x5c636391633ff2c6ULL, 0x1202020802100a04ULL, 0x93aaaa92aa393849ULL, 0xde7171d971afa8e2ULL, 0xc6c8c807c80ecf8dULL, 0xd119196419c87d32ULL, 0x3b49493949727092ULL, 0x5fd9d943d9869aafULL, 0x31f2f2eff2c31df9ULL, 0xa8e3e3abe34b48dbULL, 0xb95b5b715be22ab6ULL, 0xbc88881a8834920dULL, 0x3e9a9a529aa4c829ULL, 0x0b262698262dbe4cULL, 0xbf3232c8328dfa64ULL, 0x59b0b0fab0e94a7dULL, 0xf2e9e983e91b6acfULL, 0x770f0f3c0f78331eULL, 0x33d5d573d5e6a6b7ULL, 0xf480803a8074ba1dULL, 0x27bebec2be997c61ULL, 0xebcdcd13cd26de87ULL, 0x893434d034bde468ULL, 0x3248483d487a7590ULL, 0x54ffffdbffab24e3ULL, 0x8d7a7af57af78ff4ULL, 0x6490907a90f4ea3dULL, 0x9d5f5f615fc23ebeULL, 0x3d202080201da040ULL, 0x0f6868bd6867d5d0ULL, 0xca1a1a681ad07234ULL, 0xb7aeae82ae192c41ULL, 0x7db4b4eab4c95e75ULL, 0xce54544d549a19a8ULL, 0x7f93937693ece53bULL, 0x2f222288220daa44ULL, 0x6364648d6407e9c8ULL, 0x2af1f1e3f1db12ffULL, 0xcc7373d173bfa2e6ULL, 0x8212124812905a24ULL, 0x7a40401d403a5d80ULL, 0x4808082008402810ULL, 0x95c3c32bc356e89bULL, 0xdfecec97ec337bc5ULL, 0x4ddbdb4bdb9690abULL, 0xc0a1a1bea1611f5fULL, 0x918d8d0e8d1c8307ULL, 0xc83d3df43df5c97aULL, 0x5b97976697ccf133ULL, 0x0000000000000000ULL, 0xf9cfcf1bcf36d483ULL, 0x6e2b2bac2b458756ULL, 0xe17676c57697b3ecULL, 0xe68282328264b019ULL, 0x28d6d67fd6fea9b1ULL, 0xc31b1b6c1bd87736ULL, 0x74b5b5eeb5c15b77ULL, 0xbeafaf86af112943ULL, 0x1d6a6ab56a77dfd4ULL, 0xea50505d50ba0da0ULL, 0x5745450945124c8aULL, 0x38f3f3ebf3cb18fbULL, 0xad3030c0309df060ULL, 0xc4efef9bef2b74c3ULL, 0xda3f3ffc3fe5c37eULL, 0xc755554955921caaULL, 0xdba2a2b2a2791059ULL, 0xe9eaea8fea0365c9ULL, 0x6a656589650feccaULL, 0x03babad2bab96869ULL, 0x4a2f2fbc2f65935eULL, 0x8ec0c027c04ee79dULL, 0x60dede5fdebe81a1ULL, 0xfc1c1c701ce06c38ULL, 0x46fdfdd3fdbb2ee7ULL, 0x1f4d4d294d52649aULL, 0x7692927292e4e039ULL, 0xfa7575c9758fbceaULL, 0x3606061806301e0cULL, 0xae8a8a128a249809ULL, 0x4bb2b2f2b2f94079ULL, 0x85e6e6bfe66359d1ULL, 0x7e0e0e380e70361cULL, 0xe71f1f7c1ff8633eULL, 0x556262956237f7c4ULL, 0x3ad4d477d4eea3b5ULL, 0x81a8a89aa829324dULL, 0x5296966296c4f431ULL, 0x62f9f9c3f99b3aefULL, 0xa3c5c533c566f697ULL, 0x102525942535b14aULL, 0xab59597959f220b2ULL, 0xd084842a8454ae15ULL, 0xc57272d572b7a7e4ULL, 0xec3939e439d5dd72ULL, 0x164c4c2d4c5a6198ULL, 0x945e5e655eca3bbcULL, 0x9f7878fd78e785f0ULL, 0xe53838e038ddd870ULL, 0x988c8c0a8c148605ULL, 0x17d1d163d1c6b2bfULL, 0xe4a5a5aea5410b57ULL, 0xa1e2e2afe2434dd9ULL, 0x4e616199612ff8c2ULL, 0x42b3b3f6b3f1457bULL, 0x342121842115a542ULL, 0x089c9c4a9c94d625ULL, 0xee1e1e781ef0663cULL, 0x6143431143225286ULL, 0xb1c7c73bc776fc93ULL, 0x4ffcfcd7fcb32be5ULL, 0x2404041004201408ULL, 0xe351515951b208a2ULL, 0x2599995e99bcc72fULL, 0x226d6da96d4fc4daULL, 0x650d0d340d68391aULL, 0x79fafacffa8335e9ULL, 0x69dfdf5bdfb684a3ULL, 0xa97e7ee57ed79bfcULL, 0x19242490243db448ULL, 0xfe3b3bec3bc5d776ULL, 0x9aabab96ab313d4bULL, 0xf0cece1fce3ed181ULL, 0x9911114411885522ULL, 
0x838f8f068f0c8903ULL, 0x044e4e254e4a6b9cULL, 0x66b7b7e6b7d15173ULL, 0xe0ebeb8beb0b60cbULL, 0xc13c3cf03cfdcc78ULL, 0xfd81813e817cbf1fULL, 0x4094946a94d4fe35ULL, 0x1cf7f7fbf7eb0cf3ULL, 0x18b9b9deb9a1676fULL, 0x8b13134c13985f26ULL, 0x512c2cb02c7d9c58ULL, 0x05d3d36bd3d6b8bbULL, 0x8ce7e7bbe76b5cd3ULL, 0x396e6ea56e57cbdcULL, 0xaac4c437c46ef395ULL, 0x1b03030c03180f06ULL, 0xdc565645568a13acULL, 0x5e44440d441a4988ULL, 0xa07f7fe17fdf9efeULL, 0x88a9a99ea921374fULL, 0x672a2aa82a4d8254ULL, 0x0abbbbd6bbb16d6bULL, 0x87c1c123c146e29fULL, 0xf153535153a202a6ULL, 0x72dcdc57dcae8ba5ULL, 0x530b0b2c0b582716ULL, 0x019d9d4e9d9cd327ULL, 0x2b6c6cad6c47c1d8ULL, 0xa43131c43195f562ULL, 0xf37474cd7487b9e8ULL, 0x15f6f6fff6e309f1ULL, 0x4c464605460a438cULL, 0xa5acac8aac092645ULL, 0xb589891e893c970fULL, 0xb414145014a04428ULL, 0xbae1e1a3e15b42dfULL, 0xa616165816b04e2cULL, 0xf73a3ae83acdd274ULL, 0x066969b9696fd0d2ULL, 0x4109092409482d12ULL, 0xd77070dd70a7ade0ULL, 0x6fb6b6e2b6d95471ULL, 0x1ed0d067d0ceb7bdULL, 0xd6eded93ed3b7ec7ULL, 0xe2cccc17cc2edb85ULL, 0x68424215422a5784ULL, 0x2c98985a98b4c22dULL, 0xeda4a4aaa4490e55ULL, 0x752828a0285d8850ULL, 0x865c5c6d5cda31b8ULL, 0x6bf8f8c7f8933fedULL, 0xc28686228644a411ULL, }; static const u64 C2[256] = { 0x30d818186018c078ULL, 0x462623238c2305afULL, 0x91b8c6c63fc67ef9ULL, 0xcdfbe8e887e8136fULL, 0x13cb878726874ca1ULL, 0x6d11b8b8dab8a962ULL, 0x0209010104010805ULL, 0x9e0d4f4f214f426eULL, 0x6c9b3636d836adeeULL, 0x51ffa6a6a2a65904ULL, 0xb90cd2d26fd2debdULL, 0xf70ef5f5f3f5fb06ULL, 0xf2967979f979ef80ULL, 0xde306f6fa16f5fceULL, 0x3f6d91917e91fcefULL, 0xa4f852525552aa07ULL, 0xc04760609d6027fdULL, 0x6535bcbccabc8976ULL, 0x2b379b9b569baccdULL, 0x018a8e8e028e048cULL, 0x5bd2a3a3b6a37115ULL, 0x186c0c0c300c603cULL, 0xf6847b7bf17bff8aULL, 0x6a803535d435b5e1ULL, 0x3af51d1d741de869ULL, 0xddb3e0e0a7e05347ULL, 0xb321d7d77bd7f6acULL, 0x999cc2c22fc25eedULL, 0x5c432e2eb82e6d96ULL, 0x96294b4b314b627aULL, 0xe15dfefedffea321ULL, 0xaed5575741578216ULL, 0x2abd15155415a841ULL, 0xeee87777c1779fb6ULL, 0x6e923737dc37a5ebULL, 0xd79ee5e5b3e57b56ULL, 0x23139f9f469f8cd9ULL, 0xfd23f0f0e7f0d317ULL, 0x94204a4a354a6a7fULL, 0xa944dada4fda9e95ULL, 0xb0a258587d58fa25ULL, 0x8fcfc9c903c906caULL, 0x527c2929a429558dULL, 0x145a0a0a280a5022ULL, 0x7f50b1b1feb1e14fULL, 0x5dc9a0a0baa0691aULL, 0xd6146b6bb16b7fdaULL, 0x17d985852e855cabULL, 0x673cbdbdcebd8173ULL, 0xba8f5d5d695dd234ULL, 0x2090101040108050ULL, 0xf507f4f4f7f4f303ULL, 0x8bddcbcb0bcb16c0ULL, 0x7cd33e3ef83eedc6ULL, 0x0a2d050514052811ULL, 0xce78676781671fe6ULL, 0xd597e4e4b7e47353ULL, 0x4e0227279c2725bbULL, 0x8273414119413258ULL, 0x0ba78b8b168b2c9dULL, 0x53f6a7a7a6a75101ULL, 0xfab27d7de97dcf94ULL, 0x374995956e95dcfbULL, 0xad56d8d847d88e9fULL, 0xeb70fbfbcbfb8b30ULL, 0xc1cdeeee9fee2371ULL, 0xf8bb7c7ced7cc791ULL, 0xcc716666856617e3ULL, 0xa77bdddd53dda68eULL, 0x2eaf17175c17b84bULL, 0x8e45474701470246ULL, 0x211a9e9e429e84dcULL, 0x89d4caca0fca1ec5ULL, 0x5a582d2db42d7599ULL, 0x632ebfbfc6bf9179ULL, 0x0e3f07071c07381bULL, 0x47acadad8ead0123ULL, 0xb4b05a5a755aea2fULL, 0x1bef838336836cb5ULL, 0x66b63333cc3385ffULL, 0xc65c636391633ff2ULL, 0x041202020802100aULL, 0x4993aaaa92aa3938ULL, 0xe2de7171d971afa8ULL, 0x8dc6c8c807c80ecfULL, 0x32d119196419c87dULL, 0x923b494939497270ULL, 0xaf5fd9d943d9869aULL, 0xf931f2f2eff2c31dULL, 0xdba8e3e3abe34b48ULL, 0xb6b95b5b715be22aULL, 0x0dbc88881a883492ULL, 0x293e9a9a529aa4c8ULL, 0x4c0b262698262dbeULL, 0x64bf3232c8328dfaULL, 0x7d59b0b0fab0e94aULL, 0xcff2e9e983e91b6aULL, 0x1e770f0f3c0f7833ULL, 0xb733d5d573d5e6a6ULL, 0x1df480803a8074baULL, 0x6127bebec2be997cULL, 
0x87ebcdcd13cd26deULL, 0x68893434d034bde4ULL, 0x903248483d487a75ULL, 0xe354ffffdbffab24ULL, 0xf48d7a7af57af78fULL, 0x3d6490907a90f4eaULL, 0xbe9d5f5f615fc23eULL, 0x403d202080201da0ULL, 0xd00f6868bd6867d5ULL, 0x34ca1a1a681ad072ULL, 0x41b7aeae82ae192cULL, 0x757db4b4eab4c95eULL, 0xa8ce54544d549a19ULL, 0x3b7f93937693ece5ULL, 0x442f222288220daaULL, 0xc86364648d6407e9ULL, 0xff2af1f1e3f1db12ULL, 0xe6cc7373d173bfa2ULL, 0x248212124812905aULL, 0x807a40401d403a5dULL, 0x1048080820084028ULL, 0x9b95c3c32bc356e8ULL, 0xc5dfecec97ec337bULL, 0xab4ddbdb4bdb9690ULL, 0x5fc0a1a1bea1611fULL, 0x07918d8d0e8d1c83ULL, 0x7ac83d3df43df5c9ULL, 0x335b97976697ccf1ULL, 0x0000000000000000ULL, 0x83f9cfcf1bcf36d4ULL, 0x566e2b2bac2b4587ULL, 0xece17676c57697b3ULL, 0x19e68282328264b0ULL, 0xb128d6d67fd6fea9ULL, 0x36c31b1b6c1bd877ULL, 0x7774b5b5eeb5c15bULL, 0x43beafaf86af1129ULL, 0xd41d6a6ab56a77dfULL, 0xa0ea50505d50ba0dULL, 0x8a5745450945124cULL, 0xfb38f3f3ebf3cb18ULL, 0x60ad3030c0309df0ULL, 0xc3c4efef9bef2b74ULL, 0x7eda3f3ffc3fe5c3ULL, 0xaac755554955921cULL, 0x59dba2a2b2a27910ULL, 0xc9e9eaea8fea0365ULL, 0xca6a656589650fecULL, 0x6903babad2bab968ULL, 0x5e4a2f2fbc2f6593ULL, 0x9d8ec0c027c04ee7ULL, 0xa160dede5fdebe81ULL, 0x38fc1c1c701ce06cULL, 0xe746fdfdd3fdbb2eULL, 0x9a1f4d4d294d5264ULL, 0x397692927292e4e0ULL, 0xeafa7575c9758fbcULL, 0x0c3606061806301eULL, 0x09ae8a8a128a2498ULL, 0x794bb2b2f2b2f940ULL, 0xd185e6e6bfe66359ULL, 0x1c7e0e0e380e7036ULL, 0x3ee71f1f7c1ff863ULL, 0xc4556262956237f7ULL, 0xb53ad4d477d4eea3ULL, 0x4d81a8a89aa82932ULL, 0x315296966296c4f4ULL, 0xef62f9f9c3f99b3aULL, 0x97a3c5c533c566f6ULL, 0x4a102525942535b1ULL, 0xb2ab59597959f220ULL, 0x15d084842a8454aeULL, 0xe4c57272d572b7a7ULL, 0x72ec3939e439d5ddULL, 0x98164c4c2d4c5a61ULL, 0xbc945e5e655eca3bULL, 0xf09f7878fd78e785ULL, 0x70e53838e038ddd8ULL, 0x05988c8c0a8c1486ULL, 0xbf17d1d163d1c6b2ULL, 0x57e4a5a5aea5410bULL, 0xd9a1e2e2afe2434dULL, 0xc24e616199612ff8ULL, 0x7b42b3b3f6b3f145ULL, 0x42342121842115a5ULL, 0x25089c9c4a9c94d6ULL, 0x3cee1e1e781ef066ULL, 0x8661434311432252ULL, 0x93b1c7c73bc776fcULL, 0xe54ffcfcd7fcb32bULL, 0x0824040410042014ULL, 0xa2e351515951b208ULL, 0x2f2599995e99bcc7ULL, 0xda226d6da96d4fc4ULL, 0x1a650d0d340d6839ULL, 0xe979fafacffa8335ULL, 0xa369dfdf5bdfb684ULL, 0xfca97e7ee57ed79bULL, 0x4819242490243db4ULL, 0x76fe3b3bec3bc5d7ULL, 0x4b9aabab96ab313dULL, 0x81f0cece1fce3ed1ULL, 0x2299111144118855ULL, 0x03838f8f068f0c89ULL, 0x9c044e4e254e4a6bULL, 0x7366b7b7e6b7d151ULL, 0xcbe0ebeb8beb0b60ULL, 0x78c13c3cf03cfdccULL, 0x1ffd81813e817cbfULL, 0x354094946a94d4feULL, 0xf31cf7f7fbf7eb0cULL, 0x6f18b9b9deb9a167ULL, 0x268b13134c13985fULL, 0x58512c2cb02c7d9cULL, 0xbb05d3d36bd3d6b8ULL, 0xd38ce7e7bbe76b5cULL, 0xdc396e6ea56e57cbULL, 0x95aac4c437c46ef3ULL, 0x061b03030c03180fULL, 0xacdc565645568a13ULL, 0x885e44440d441a49ULL, 0xfea07f7fe17fdf9eULL, 0x4f88a9a99ea92137ULL, 0x54672a2aa82a4d82ULL, 0x6b0abbbbd6bbb16dULL, 0x9f87c1c123c146e2ULL, 0xa6f153535153a202ULL, 0xa572dcdc57dcae8bULL, 0x16530b0b2c0b5827ULL, 0x27019d9d4e9d9cd3ULL, 0xd82b6c6cad6c47c1ULL, 0x62a43131c43195f5ULL, 0xe8f37474cd7487b9ULL, 0xf115f6f6fff6e309ULL, 0x8c4c464605460a43ULL, 0x45a5acac8aac0926ULL, 0x0fb589891e893c97ULL, 0x28b414145014a044ULL, 0xdfbae1e1a3e15b42ULL, 0x2ca616165816b04eULL, 0x74f73a3ae83acdd2ULL, 0xd2066969b9696fd0ULL, 0x124109092409482dULL, 0xe0d77070dd70a7adULL, 0x716fb6b6e2b6d954ULL, 0xbd1ed0d067d0ceb7ULL, 0xc7d6eded93ed3b7eULL, 0x85e2cccc17cc2edbULL, 0x8468424215422a57ULL, 0x2d2c98985a98b4c2ULL, 0x55eda4a4aaa4490eULL, 0x50752828a0285d88ULL, 0xb8865c5c6d5cda31ULL, 0xed6bf8f8c7f8933fULL, 
0x11c28686228644a4ULL, }; static const u64 C3[256] = { 0x7830d818186018c0ULL, 0xaf462623238c2305ULL, 0xf991b8c6c63fc67eULL, 0x6fcdfbe8e887e813ULL, 0xa113cb878726874cULL, 0x626d11b8b8dab8a9ULL, 0x0502090101040108ULL, 0x6e9e0d4f4f214f42ULL, 0xee6c9b3636d836adULL, 0x0451ffa6a6a2a659ULL, 0xbdb90cd2d26fd2deULL, 0x06f70ef5f5f3f5fbULL, 0x80f2967979f979efULL, 0xcede306f6fa16f5fULL, 0xef3f6d91917e91fcULL, 0x07a4f852525552aaULL, 0xfdc04760609d6027ULL, 0x766535bcbccabc89ULL, 0xcd2b379b9b569bacULL, 0x8c018a8e8e028e04ULL, 0x155bd2a3a3b6a371ULL, 0x3c186c0c0c300c60ULL, 0x8af6847b7bf17bffULL, 0xe16a803535d435b5ULL, 0x693af51d1d741de8ULL, 0x47ddb3e0e0a7e053ULL, 0xacb321d7d77bd7f6ULL, 0xed999cc2c22fc25eULL, 0x965c432e2eb82e6dULL, 0x7a96294b4b314b62ULL, 0x21e15dfefedffea3ULL, 0x16aed55757415782ULL, 0x412abd15155415a8ULL, 0xb6eee87777c1779fULL, 0xeb6e923737dc37a5ULL, 0x56d79ee5e5b3e57bULL, 0xd923139f9f469f8cULL, 0x17fd23f0f0e7f0d3ULL, 0x7f94204a4a354a6aULL, 0x95a944dada4fda9eULL, 0x25b0a258587d58faULL, 0xca8fcfc9c903c906ULL, 0x8d527c2929a42955ULL, 0x22145a0a0a280a50ULL, 0x4f7f50b1b1feb1e1ULL, 0x1a5dc9a0a0baa069ULL, 0xdad6146b6bb16b7fULL, 0xab17d985852e855cULL, 0x73673cbdbdcebd81ULL, 0x34ba8f5d5d695dd2ULL, 0x5020901010401080ULL, 0x03f507f4f4f7f4f3ULL, 0xc08bddcbcb0bcb16ULL, 0xc67cd33e3ef83eedULL, 0x110a2d0505140528ULL, 0xe6ce78676781671fULL, 0x53d597e4e4b7e473ULL, 0xbb4e0227279c2725ULL, 0x5882734141194132ULL, 0x9d0ba78b8b168b2cULL, 0x0153f6a7a7a6a751ULL, 0x94fab27d7de97dcfULL, 0xfb374995956e95dcULL, 0x9fad56d8d847d88eULL, 0x30eb70fbfbcbfb8bULL, 0x71c1cdeeee9fee23ULL, 0x91f8bb7c7ced7cc7ULL, 0xe3cc716666856617ULL, 0x8ea77bdddd53dda6ULL, 0x4b2eaf17175c17b8ULL, 0x468e454747014702ULL, 0xdc211a9e9e429e84ULL, 0xc589d4caca0fca1eULL, 0x995a582d2db42d75ULL, 0x79632ebfbfc6bf91ULL, 0x1b0e3f07071c0738ULL, 0x2347acadad8ead01ULL, 0x2fb4b05a5a755aeaULL, 0xb51bef838336836cULL, 0xff66b63333cc3385ULL, 0xf2c65c636391633fULL, 0x0a04120202080210ULL, 0x384993aaaa92aa39ULL, 0xa8e2de7171d971afULL, 0xcf8dc6c8c807c80eULL, 0x7d32d119196419c8ULL, 0x70923b4949394972ULL, 0x9aaf5fd9d943d986ULL, 0x1df931f2f2eff2c3ULL, 0x48dba8e3e3abe34bULL, 0x2ab6b95b5b715be2ULL, 0x920dbc88881a8834ULL, 0xc8293e9a9a529aa4ULL, 0xbe4c0b262698262dULL, 0xfa64bf3232c8328dULL, 0x4a7d59b0b0fab0e9ULL, 0x6acff2e9e983e91bULL, 0x331e770f0f3c0f78ULL, 0xa6b733d5d573d5e6ULL, 0xba1df480803a8074ULL, 0x7c6127bebec2be99ULL, 0xde87ebcdcd13cd26ULL, 0xe468893434d034bdULL, 0x75903248483d487aULL, 0x24e354ffffdbffabULL, 0x8ff48d7a7af57af7ULL, 0xea3d6490907a90f4ULL, 0x3ebe9d5f5f615fc2ULL, 0xa0403d202080201dULL, 0xd5d00f6868bd6867ULL, 0x7234ca1a1a681ad0ULL, 0x2c41b7aeae82ae19ULL, 0x5e757db4b4eab4c9ULL, 0x19a8ce54544d549aULL, 0xe53b7f93937693ecULL, 0xaa442f222288220dULL, 0xe9c86364648d6407ULL, 0x12ff2af1f1e3f1dbULL, 0xa2e6cc7373d173bfULL, 0x5a24821212481290ULL, 0x5d807a40401d403aULL, 0x2810480808200840ULL, 0xe89b95c3c32bc356ULL, 0x7bc5dfecec97ec33ULL, 0x90ab4ddbdb4bdb96ULL, 0x1f5fc0a1a1bea161ULL, 0x8307918d8d0e8d1cULL, 0xc97ac83d3df43df5ULL, 0xf1335b97976697ccULL, 0x0000000000000000ULL, 0xd483f9cfcf1bcf36ULL, 0x87566e2b2bac2b45ULL, 0xb3ece17676c57697ULL, 0xb019e68282328264ULL, 0xa9b128d6d67fd6feULL, 0x7736c31b1b6c1bd8ULL, 0x5b7774b5b5eeb5c1ULL, 0x2943beafaf86af11ULL, 0xdfd41d6a6ab56a77ULL, 0x0da0ea50505d50baULL, 0x4c8a574545094512ULL, 0x18fb38f3f3ebf3cbULL, 0xf060ad3030c0309dULL, 0x74c3c4efef9bef2bULL, 0xc37eda3f3ffc3fe5ULL, 0x1caac75555495592ULL, 0x1059dba2a2b2a279ULL, 0x65c9e9eaea8fea03ULL, 0xecca6a656589650fULL, 0x686903babad2bab9ULL, 0x935e4a2f2fbc2f65ULL, 0xe79d8ec0c027c04eULL, 
0x81a160dede5fdebeULL, 0x6c38fc1c1c701ce0ULL, 0x2ee746fdfdd3fdbbULL, 0x649a1f4d4d294d52ULL, 0xe0397692927292e4ULL, 0xbceafa7575c9758fULL, 0x1e0c360606180630ULL, 0x9809ae8a8a128a24ULL, 0x40794bb2b2f2b2f9ULL, 0x59d185e6e6bfe663ULL, 0x361c7e0e0e380e70ULL, 0x633ee71f1f7c1ff8ULL, 0xf7c4556262956237ULL, 0xa3b53ad4d477d4eeULL, 0x324d81a8a89aa829ULL, 0xf4315296966296c4ULL, 0x3aef62f9f9c3f99bULL, 0xf697a3c5c533c566ULL, 0xb14a102525942535ULL, 0x20b2ab59597959f2ULL, 0xae15d084842a8454ULL, 0xa7e4c57272d572b7ULL, 0xdd72ec3939e439d5ULL, 0x6198164c4c2d4c5aULL, 0x3bbc945e5e655ecaULL, 0x85f09f7878fd78e7ULL, 0xd870e53838e038ddULL, 0x8605988c8c0a8c14ULL, 0xb2bf17d1d163d1c6ULL, 0x0b57e4a5a5aea541ULL, 0x4dd9a1e2e2afe243ULL, 0xf8c24e616199612fULL, 0x457b42b3b3f6b3f1ULL, 0xa542342121842115ULL, 0xd625089c9c4a9c94ULL, 0x663cee1e1e781ef0ULL, 0x5286614343114322ULL, 0xfc93b1c7c73bc776ULL, 0x2be54ffcfcd7fcb3ULL, 0x1408240404100420ULL, 0x08a2e351515951b2ULL, 0xc72f2599995e99bcULL, 0xc4da226d6da96d4fULL, 0x391a650d0d340d68ULL, 0x35e979fafacffa83ULL, 0x84a369dfdf5bdfb6ULL, 0x9bfca97e7ee57ed7ULL, 0xb44819242490243dULL, 0xd776fe3b3bec3bc5ULL, 0x3d4b9aabab96ab31ULL, 0xd181f0cece1fce3eULL, 0x5522991111441188ULL, 0x8903838f8f068f0cULL, 0x6b9c044e4e254e4aULL, 0x517366b7b7e6b7d1ULL, 0x60cbe0ebeb8beb0bULL, 0xcc78c13c3cf03cfdULL, 0xbf1ffd81813e817cULL, 0xfe354094946a94d4ULL, 0x0cf31cf7f7fbf7ebULL, 0x676f18b9b9deb9a1ULL, 0x5f268b13134c1398ULL, 0x9c58512c2cb02c7dULL, 0xb8bb05d3d36bd3d6ULL, 0x5cd38ce7e7bbe76bULL, 0xcbdc396e6ea56e57ULL, 0xf395aac4c437c46eULL, 0x0f061b03030c0318ULL, 0x13acdc565645568aULL, 0x49885e44440d441aULL, 0x9efea07f7fe17fdfULL, 0x374f88a9a99ea921ULL, 0x8254672a2aa82a4dULL, 0x6d6b0abbbbd6bbb1ULL, 0xe29f87c1c123c146ULL, 0x02a6f153535153a2ULL, 0x8ba572dcdc57dcaeULL, 0x2716530b0b2c0b58ULL, 0xd327019d9d4e9d9cULL, 0xc1d82b6c6cad6c47ULL, 0xf562a43131c43195ULL, 0xb9e8f37474cd7487ULL, 0x09f115f6f6fff6e3ULL, 0x438c4c464605460aULL, 0x2645a5acac8aac09ULL, 0x970fb589891e893cULL, 0x4428b414145014a0ULL, 0x42dfbae1e1a3e15bULL, 0x4e2ca616165816b0ULL, 0xd274f73a3ae83acdULL, 0xd0d2066969b9696fULL, 0x2d12410909240948ULL, 0xade0d77070dd70a7ULL, 0x54716fb6b6e2b6d9ULL, 0xb7bd1ed0d067d0ceULL, 0x7ec7d6eded93ed3bULL, 0xdb85e2cccc17cc2eULL, 0x578468424215422aULL, 0xc22d2c98985a98b4ULL, 0x0e55eda4a4aaa449ULL, 0x8850752828a0285dULL, 0x31b8865c5c6d5cdaULL, 0x3fed6bf8f8c7f893ULL, 0xa411c28686228644ULL, }; static const u64 C4[256] = { 0xc07830d818186018ULL, 0x05af462623238c23ULL, 0x7ef991b8c6c63fc6ULL, 0x136fcdfbe8e887e8ULL, 0x4ca113cb87872687ULL, 0xa9626d11b8b8dab8ULL, 0x0805020901010401ULL, 0x426e9e0d4f4f214fULL, 0xadee6c9b3636d836ULL, 0x590451ffa6a6a2a6ULL, 0xdebdb90cd2d26fd2ULL, 0xfb06f70ef5f5f3f5ULL, 0xef80f2967979f979ULL, 0x5fcede306f6fa16fULL, 0xfcef3f6d91917e91ULL, 0xaa07a4f852525552ULL, 0x27fdc04760609d60ULL, 0x89766535bcbccabcULL, 0xaccd2b379b9b569bULL, 0x048c018a8e8e028eULL, 0x71155bd2a3a3b6a3ULL, 0x603c186c0c0c300cULL, 0xff8af6847b7bf17bULL, 0xb5e16a803535d435ULL, 0xe8693af51d1d741dULL, 0x5347ddb3e0e0a7e0ULL, 0xf6acb321d7d77bd7ULL, 0x5eed999cc2c22fc2ULL, 0x6d965c432e2eb82eULL, 0x627a96294b4b314bULL, 0xa321e15dfefedffeULL, 0x8216aed557574157ULL, 0xa8412abd15155415ULL, 0x9fb6eee87777c177ULL, 0xa5eb6e923737dc37ULL, 0x7b56d79ee5e5b3e5ULL, 0x8cd923139f9f469fULL, 0xd317fd23f0f0e7f0ULL, 0x6a7f94204a4a354aULL, 0x9e95a944dada4fdaULL, 0xfa25b0a258587d58ULL, 0x06ca8fcfc9c903c9ULL, 0x558d527c2929a429ULL, 0x5022145a0a0a280aULL, 0xe14f7f50b1b1feb1ULL, 0x691a5dc9a0a0baa0ULL, 0x7fdad6146b6bb16bULL, 0x5cab17d985852e85ULL, 0x8173673cbdbdcebdULL, 
0xd234ba8f5d5d695dULL, 0x8050209010104010ULL, 0xf303f507f4f4f7f4ULL, 0x16c08bddcbcb0bcbULL, 0xedc67cd33e3ef83eULL, 0x28110a2d05051405ULL, 0x1fe6ce7867678167ULL, 0x7353d597e4e4b7e4ULL, 0x25bb4e0227279c27ULL, 0x3258827341411941ULL, 0x2c9d0ba78b8b168bULL, 0x510153f6a7a7a6a7ULL, 0xcf94fab27d7de97dULL, 0xdcfb374995956e95ULL, 0x8e9fad56d8d847d8ULL, 0x8b30eb70fbfbcbfbULL, 0x2371c1cdeeee9feeULL, 0xc791f8bb7c7ced7cULL, 0x17e3cc7166668566ULL, 0xa68ea77bdddd53ddULL, 0xb84b2eaf17175c17ULL, 0x02468e4547470147ULL, 0x84dc211a9e9e429eULL, 0x1ec589d4caca0fcaULL, 0x75995a582d2db42dULL, 0x9179632ebfbfc6bfULL, 0x381b0e3f07071c07ULL, 0x012347acadad8eadULL, 0xea2fb4b05a5a755aULL, 0x6cb51bef83833683ULL, 0x85ff66b63333cc33ULL, 0x3ff2c65c63639163ULL, 0x100a041202020802ULL, 0x39384993aaaa92aaULL, 0xafa8e2de7171d971ULL, 0x0ecf8dc6c8c807c8ULL, 0xc87d32d119196419ULL, 0x7270923b49493949ULL, 0x869aaf5fd9d943d9ULL, 0xc31df931f2f2eff2ULL, 0x4b48dba8e3e3abe3ULL, 0xe22ab6b95b5b715bULL, 0x34920dbc88881a88ULL, 0xa4c8293e9a9a529aULL, 0x2dbe4c0b26269826ULL, 0x8dfa64bf3232c832ULL, 0xe94a7d59b0b0fab0ULL, 0x1b6acff2e9e983e9ULL, 0x78331e770f0f3c0fULL, 0xe6a6b733d5d573d5ULL, 0x74ba1df480803a80ULL, 0x997c6127bebec2beULL, 0x26de87ebcdcd13cdULL, 0xbde468893434d034ULL, 0x7a75903248483d48ULL, 0xab24e354ffffdbffULL, 0xf78ff48d7a7af57aULL, 0xf4ea3d6490907a90ULL, 0xc23ebe9d5f5f615fULL, 0x1da0403d20208020ULL, 0x67d5d00f6868bd68ULL, 0xd07234ca1a1a681aULL, 0x192c41b7aeae82aeULL, 0xc95e757db4b4eab4ULL, 0x9a19a8ce54544d54ULL, 0xece53b7f93937693ULL, 0x0daa442f22228822ULL, 0x07e9c86364648d64ULL, 0xdb12ff2af1f1e3f1ULL, 0xbfa2e6cc7373d173ULL, 0x905a248212124812ULL, 0x3a5d807a40401d40ULL, 0x4028104808082008ULL, 0x56e89b95c3c32bc3ULL, 0x337bc5dfecec97ecULL, 0x9690ab4ddbdb4bdbULL, 0x611f5fc0a1a1bea1ULL, 0x1c8307918d8d0e8dULL, 0xf5c97ac83d3df43dULL, 0xccf1335b97976697ULL, 0x0000000000000000ULL, 0x36d483f9cfcf1bcfULL, 0x4587566e2b2bac2bULL, 0x97b3ece17676c576ULL, 0x64b019e682823282ULL, 0xfea9b128d6d67fd6ULL, 0xd87736c31b1b6c1bULL, 0xc15b7774b5b5eeb5ULL, 0x112943beafaf86afULL, 0x77dfd41d6a6ab56aULL, 0xba0da0ea50505d50ULL, 0x124c8a5745450945ULL, 0xcb18fb38f3f3ebf3ULL, 0x9df060ad3030c030ULL, 0x2b74c3c4efef9befULL, 0xe5c37eda3f3ffc3fULL, 0x921caac755554955ULL, 0x791059dba2a2b2a2ULL, 0x0365c9e9eaea8feaULL, 0x0fecca6a65658965ULL, 0xb9686903babad2baULL, 0x65935e4a2f2fbc2fULL, 0x4ee79d8ec0c027c0ULL, 0xbe81a160dede5fdeULL, 0xe06c38fc1c1c701cULL, 0xbb2ee746fdfdd3fdULL, 0x52649a1f4d4d294dULL, 0xe4e0397692927292ULL, 0x8fbceafa7575c975ULL, 0x301e0c3606061806ULL, 0x249809ae8a8a128aULL, 0xf940794bb2b2f2b2ULL, 0x6359d185e6e6bfe6ULL, 0x70361c7e0e0e380eULL, 0xf8633ee71f1f7c1fULL, 0x37f7c45562629562ULL, 0xeea3b53ad4d477d4ULL, 0x29324d81a8a89aa8ULL, 0xc4f4315296966296ULL, 0x9b3aef62f9f9c3f9ULL, 0x66f697a3c5c533c5ULL, 0x35b14a1025259425ULL, 0xf220b2ab59597959ULL, 0x54ae15d084842a84ULL, 0xb7a7e4c57272d572ULL, 0xd5dd72ec3939e439ULL, 0x5a6198164c4c2d4cULL, 0xca3bbc945e5e655eULL, 0xe785f09f7878fd78ULL, 0xddd870e53838e038ULL, 0x148605988c8c0a8cULL, 0xc6b2bf17d1d163d1ULL, 0x410b57e4a5a5aea5ULL, 0x434dd9a1e2e2afe2ULL, 0x2ff8c24e61619961ULL, 0xf1457b42b3b3f6b3ULL, 0x15a5423421218421ULL, 0x94d625089c9c4a9cULL, 0xf0663cee1e1e781eULL, 0x2252866143431143ULL, 0x76fc93b1c7c73bc7ULL, 0xb32be54ffcfcd7fcULL, 0x2014082404041004ULL, 0xb208a2e351515951ULL, 0xbcc72f2599995e99ULL, 0x4fc4da226d6da96dULL, 0x68391a650d0d340dULL, 0x8335e979fafacffaULL, 0xb684a369dfdf5bdfULL, 0xd79bfca97e7ee57eULL, 0x3db4481924249024ULL, 0xc5d776fe3b3bec3bULL, 0x313d4b9aabab96abULL, 0x3ed181f0cece1fceULL, 
0x8855229911114411ULL, 0x0c8903838f8f068fULL, 0x4a6b9c044e4e254eULL, 0xd1517366b7b7e6b7ULL, 0x0b60cbe0ebeb8bebULL, 0xfdcc78c13c3cf03cULL, 0x7cbf1ffd81813e81ULL, 0xd4fe354094946a94ULL, 0xeb0cf31cf7f7fbf7ULL, 0xa1676f18b9b9deb9ULL, 0x985f268b13134c13ULL, 0x7d9c58512c2cb02cULL, 0xd6b8bb05d3d36bd3ULL, 0x6b5cd38ce7e7bbe7ULL, 0x57cbdc396e6ea56eULL, 0x6ef395aac4c437c4ULL, 0x180f061b03030c03ULL, 0x8a13acdc56564556ULL, 0x1a49885e44440d44ULL, 0xdf9efea07f7fe17fULL, 0x21374f88a9a99ea9ULL, 0x4d8254672a2aa82aULL, 0xb16d6b0abbbbd6bbULL, 0x46e29f87c1c123c1ULL, 0xa202a6f153535153ULL, 0xae8ba572dcdc57dcULL, 0x582716530b0b2c0bULL, 0x9cd327019d9d4e9dULL, 0x47c1d82b6c6cad6cULL, 0x95f562a43131c431ULL, 0x87b9e8f37474cd74ULL, 0xe309f115f6f6fff6ULL, 0x0a438c4c46460546ULL, 0x092645a5acac8aacULL, 0x3c970fb589891e89ULL, 0xa04428b414145014ULL, 0x5b42dfbae1e1a3e1ULL, 0xb04e2ca616165816ULL, 0xcdd274f73a3ae83aULL, 0x6fd0d2066969b969ULL, 0x482d124109092409ULL, 0xa7ade0d77070dd70ULL, 0xd954716fb6b6e2b6ULL, 0xceb7bd1ed0d067d0ULL, 0x3b7ec7d6eded93edULL, 0x2edb85e2cccc17ccULL, 0x2a57846842421542ULL, 0xb4c22d2c98985a98ULL, 0x490e55eda4a4aaa4ULL, 0x5d8850752828a028ULL, 0xda31b8865c5c6d5cULL, 0x933fed6bf8f8c7f8ULL, 0x44a411c286862286ULL, }; static const u64 C5[256] = { 0x18c07830d8181860ULL, 0x2305af462623238cULL, 0xc67ef991b8c6c63fULL, 0xe8136fcdfbe8e887ULL, 0x874ca113cb878726ULL, 0xb8a9626d11b8b8daULL, 0x0108050209010104ULL, 0x4f426e9e0d4f4f21ULL, 0x36adee6c9b3636d8ULL, 0xa6590451ffa6a6a2ULL, 0xd2debdb90cd2d26fULL, 0xf5fb06f70ef5f5f3ULL, 0x79ef80f2967979f9ULL, 0x6f5fcede306f6fa1ULL, 0x91fcef3f6d91917eULL, 0x52aa07a4f8525255ULL, 0x6027fdc04760609dULL, 0xbc89766535bcbccaULL, 0x9baccd2b379b9b56ULL, 0x8e048c018a8e8e02ULL, 0xa371155bd2a3a3b6ULL, 0x0c603c186c0c0c30ULL, 0x7bff8af6847b7bf1ULL, 0x35b5e16a803535d4ULL, 0x1de8693af51d1d74ULL, 0xe05347ddb3e0e0a7ULL, 0xd7f6acb321d7d77bULL, 0xc25eed999cc2c22fULL, 0x2e6d965c432e2eb8ULL, 0x4b627a96294b4b31ULL, 0xfea321e15dfefedfULL, 0x578216aed5575741ULL, 0x15a8412abd151554ULL, 0x779fb6eee87777c1ULL, 0x37a5eb6e923737dcULL, 0xe57b56d79ee5e5b3ULL, 0x9f8cd923139f9f46ULL, 0xf0d317fd23f0f0e7ULL, 0x4a6a7f94204a4a35ULL, 0xda9e95a944dada4fULL, 0x58fa25b0a258587dULL, 0xc906ca8fcfc9c903ULL, 0x29558d527c2929a4ULL, 0x0a5022145a0a0a28ULL, 0xb1e14f7f50b1b1feULL, 0xa0691a5dc9a0a0baULL, 0x6b7fdad6146b6bb1ULL, 0x855cab17d985852eULL, 0xbd8173673cbdbdceULL, 0x5dd234ba8f5d5d69ULL, 0x1080502090101040ULL, 0xf4f303f507f4f4f7ULL, 0xcb16c08bddcbcb0bULL, 0x3eedc67cd33e3ef8ULL, 0x0528110a2d050514ULL, 0x671fe6ce78676781ULL, 0xe47353d597e4e4b7ULL, 0x2725bb4e0227279cULL, 0x4132588273414119ULL, 0x8b2c9d0ba78b8b16ULL, 0xa7510153f6a7a7a6ULL, 0x7dcf94fab27d7de9ULL, 0x95dcfb374995956eULL, 0xd88e9fad56d8d847ULL, 0xfb8b30eb70fbfbcbULL, 0xee2371c1cdeeee9fULL, 0x7cc791f8bb7c7cedULL, 0x6617e3cc71666685ULL, 0xdda68ea77bdddd53ULL, 0x17b84b2eaf17175cULL, 0x4702468e45474701ULL, 0x9e84dc211a9e9e42ULL, 0xca1ec589d4caca0fULL, 0x2d75995a582d2db4ULL, 0xbf9179632ebfbfc6ULL, 0x07381b0e3f07071cULL, 0xad012347acadad8eULL, 0x5aea2fb4b05a5a75ULL, 0x836cb51bef838336ULL, 0x3385ff66b63333ccULL, 0x633ff2c65c636391ULL, 0x02100a0412020208ULL, 0xaa39384993aaaa92ULL, 0x71afa8e2de7171d9ULL, 0xc80ecf8dc6c8c807ULL, 0x19c87d32d1191964ULL, 0x497270923b494939ULL, 0xd9869aaf5fd9d943ULL, 0xf2c31df931f2f2efULL, 0xe34b48dba8e3e3abULL, 0x5be22ab6b95b5b71ULL, 0x8834920dbc88881aULL, 0x9aa4c8293e9a9a52ULL, 0x262dbe4c0b262698ULL, 0x328dfa64bf3232c8ULL, 0xb0e94a7d59b0b0faULL, 0xe91b6acff2e9e983ULL, 0x0f78331e770f0f3cULL, 0xd5e6a6b733d5d573ULL, 0x8074ba1df480803aULL, 
0xbe997c6127bebec2ULL, 0xcd26de87ebcdcd13ULL, 0x34bde468893434d0ULL, 0x487a75903248483dULL, 0xffab24e354ffffdbULL, 0x7af78ff48d7a7af5ULL, 0x90f4ea3d6490907aULL, 0x5fc23ebe9d5f5f61ULL, 0x201da0403d202080ULL, 0x6867d5d00f6868bdULL, 0x1ad07234ca1a1a68ULL, 0xae192c41b7aeae82ULL, 0xb4c95e757db4b4eaULL, 0x549a19a8ce54544dULL, 0x93ece53b7f939376ULL, 0x220daa442f222288ULL, 0x6407e9c86364648dULL, 0xf1db12ff2af1f1e3ULL, 0x73bfa2e6cc7373d1ULL, 0x12905a2482121248ULL, 0x403a5d807a40401dULL, 0x0840281048080820ULL, 0xc356e89b95c3c32bULL, 0xec337bc5dfecec97ULL, 0xdb9690ab4ddbdb4bULL, 0xa1611f5fc0a1a1beULL, 0x8d1c8307918d8d0eULL, 0x3df5c97ac83d3df4ULL, 0x97ccf1335b979766ULL, 0x0000000000000000ULL, 0xcf36d483f9cfcf1bULL, 0x2b4587566e2b2bacULL, 0x7697b3ece17676c5ULL, 0x8264b019e6828232ULL, 0xd6fea9b128d6d67fULL, 0x1bd87736c31b1b6cULL, 0xb5c15b7774b5b5eeULL, 0xaf112943beafaf86ULL, 0x6a77dfd41d6a6ab5ULL, 0x50ba0da0ea50505dULL, 0x45124c8a57454509ULL, 0xf3cb18fb38f3f3ebULL, 0x309df060ad3030c0ULL, 0xef2b74c3c4efef9bULL, 0x3fe5c37eda3f3ffcULL, 0x55921caac7555549ULL, 0xa2791059dba2a2b2ULL, 0xea0365c9e9eaea8fULL, 0x650fecca6a656589ULL, 0xbab9686903babad2ULL, 0x2f65935e4a2f2fbcULL, 0xc04ee79d8ec0c027ULL, 0xdebe81a160dede5fULL, 0x1ce06c38fc1c1c70ULL, 0xfdbb2ee746fdfdd3ULL, 0x4d52649a1f4d4d29ULL, 0x92e4e03976929272ULL, 0x758fbceafa7575c9ULL, 0x06301e0c36060618ULL, 0x8a249809ae8a8a12ULL, 0xb2f940794bb2b2f2ULL, 0xe66359d185e6e6bfULL, 0x0e70361c7e0e0e38ULL, 0x1ff8633ee71f1f7cULL, 0x6237f7c455626295ULL, 0xd4eea3b53ad4d477ULL, 0xa829324d81a8a89aULL, 0x96c4f43152969662ULL, 0xf99b3aef62f9f9c3ULL, 0xc566f697a3c5c533ULL, 0x2535b14a10252594ULL, 0x59f220b2ab595979ULL, 0x8454ae15d084842aULL, 0x72b7a7e4c57272d5ULL, 0x39d5dd72ec3939e4ULL, 0x4c5a6198164c4c2dULL, 0x5eca3bbc945e5e65ULL, 0x78e785f09f7878fdULL, 0x38ddd870e53838e0ULL, 0x8c148605988c8c0aULL, 0xd1c6b2bf17d1d163ULL, 0xa5410b57e4a5a5aeULL, 0xe2434dd9a1e2e2afULL, 0x612ff8c24e616199ULL, 0xb3f1457b42b3b3f6ULL, 0x2115a54234212184ULL, 0x9c94d625089c9c4aULL, 0x1ef0663cee1e1e78ULL, 0x4322528661434311ULL, 0xc776fc93b1c7c73bULL, 0xfcb32be54ffcfcd7ULL, 0x0420140824040410ULL, 0x51b208a2e3515159ULL, 0x99bcc72f2599995eULL, 0x6d4fc4da226d6da9ULL, 0x0d68391a650d0d34ULL, 0xfa8335e979fafacfULL, 0xdfb684a369dfdf5bULL, 0x7ed79bfca97e7ee5ULL, 0x243db44819242490ULL, 0x3bc5d776fe3b3becULL, 0xab313d4b9aabab96ULL, 0xce3ed181f0cece1fULL, 0x1188552299111144ULL, 0x8f0c8903838f8f06ULL, 0x4e4a6b9c044e4e25ULL, 0xb7d1517366b7b7e6ULL, 0xeb0b60cbe0ebeb8bULL, 0x3cfdcc78c13c3cf0ULL, 0x817cbf1ffd81813eULL, 0x94d4fe354094946aULL, 0xf7eb0cf31cf7f7fbULL, 0xb9a1676f18b9b9deULL, 0x13985f268b13134cULL, 0x2c7d9c58512c2cb0ULL, 0xd3d6b8bb05d3d36bULL, 0xe76b5cd38ce7e7bbULL, 0x6e57cbdc396e6ea5ULL, 0xc46ef395aac4c437ULL, 0x03180f061b03030cULL, 0x568a13acdc565645ULL, 0x441a49885e44440dULL, 0x7fdf9efea07f7fe1ULL, 0xa921374f88a9a99eULL, 0x2a4d8254672a2aa8ULL, 0xbbb16d6b0abbbbd6ULL, 0xc146e29f87c1c123ULL, 0x53a202a6f1535351ULL, 0xdcae8ba572dcdc57ULL, 0x0b582716530b0b2cULL, 0x9d9cd327019d9d4eULL, 0x6c47c1d82b6c6cadULL, 0x3195f562a43131c4ULL, 0x7487b9e8f37474cdULL, 0xf6e309f115f6f6ffULL, 0x460a438c4c464605ULL, 0xac092645a5acac8aULL, 0x893c970fb589891eULL, 0x14a04428b4141450ULL, 0xe15b42dfbae1e1a3ULL, 0x16b04e2ca6161658ULL, 0x3acdd274f73a3ae8ULL, 0x696fd0d2066969b9ULL, 0x09482d1241090924ULL, 0x70a7ade0d77070ddULL, 0xb6d954716fb6b6e2ULL, 0xd0ceb7bd1ed0d067ULL, 0xed3b7ec7d6eded93ULL, 0xcc2edb85e2cccc17ULL, 0x422a578468424215ULL, 0x98b4c22d2c98985aULL, 0xa4490e55eda4a4aaULL, 0x285d8850752828a0ULL, 0x5cda31b8865c5c6dULL, 
0xf8933fed6bf8f8c7ULL, 0x8644a411c2868622ULL, }; static const u64 C6[256] = { 0x6018c07830d81818ULL, 0x8c2305af46262323ULL, 0x3fc67ef991b8c6c6ULL, 0x87e8136fcdfbe8e8ULL, 0x26874ca113cb8787ULL, 0xdab8a9626d11b8b8ULL, 0x0401080502090101ULL, 0x214f426e9e0d4f4fULL, 0xd836adee6c9b3636ULL, 0xa2a6590451ffa6a6ULL, 0x6fd2debdb90cd2d2ULL, 0xf3f5fb06f70ef5f5ULL, 0xf979ef80f2967979ULL, 0xa16f5fcede306f6fULL, 0x7e91fcef3f6d9191ULL, 0x5552aa07a4f85252ULL, 0x9d6027fdc0476060ULL, 0xcabc89766535bcbcULL, 0x569baccd2b379b9bULL, 0x028e048c018a8e8eULL, 0xb6a371155bd2a3a3ULL, 0x300c603c186c0c0cULL, 0xf17bff8af6847b7bULL, 0xd435b5e16a803535ULL, 0x741de8693af51d1dULL, 0xa7e05347ddb3e0e0ULL, 0x7bd7f6acb321d7d7ULL, 0x2fc25eed999cc2c2ULL, 0xb82e6d965c432e2eULL, 0x314b627a96294b4bULL, 0xdffea321e15dfefeULL, 0x41578216aed55757ULL, 0x5415a8412abd1515ULL, 0xc1779fb6eee87777ULL, 0xdc37a5eb6e923737ULL, 0xb3e57b56d79ee5e5ULL, 0x469f8cd923139f9fULL, 0xe7f0d317fd23f0f0ULL, 0x354a6a7f94204a4aULL, 0x4fda9e95a944dadaULL, 0x7d58fa25b0a25858ULL, 0x03c906ca8fcfc9c9ULL, 0xa429558d527c2929ULL, 0x280a5022145a0a0aULL, 0xfeb1e14f7f50b1b1ULL, 0xbaa0691a5dc9a0a0ULL, 0xb16b7fdad6146b6bULL, 0x2e855cab17d98585ULL, 0xcebd8173673cbdbdULL, 0x695dd234ba8f5d5dULL, 0x4010805020901010ULL, 0xf7f4f303f507f4f4ULL, 0x0bcb16c08bddcbcbULL, 0xf83eedc67cd33e3eULL, 0x140528110a2d0505ULL, 0x81671fe6ce786767ULL, 0xb7e47353d597e4e4ULL, 0x9c2725bb4e022727ULL, 0x1941325882734141ULL, 0x168b2c9d0ba78b8bULL, 0xa6a7510153f6a7a7ULL, 0xe97dcf94fab27d7dULL, 0x6e95dcfb37499595ULL, 0x47d88e9fad56d8d8ULL, 0xcbfb8b30eb70fbfbULL, 0x9fee2371c1cdeeeeULL, 0xed7cc791f8bb7c7cULL, 0x856617e3cc716666ULL, 0x53dda68ea77bddddULL, 0x5c17b84b2eaf1717ULL, 0x014702468e454747ULL, 0x429e84dc211a9e9eULL, 0x0fca1ec589d4cacaULL, 0xb42d75995a582d2dULL, 0xc6bf9179632ebfbfULL, 0x1c07381b0e3f0707ULL, 0x8ead012347acadadULL, 0x755aea2fb4b05a5aULL, 0x36836cb51bef8383ULL, 0xcc3385ff66b63333ULL, 0x91633ff2c65c6363ULL, 0x0802100a04120202ULL, 0x92aa39384993aaaaULL, 0xd971afa8e2de7171ULL, 0x07c80ecf8dc6c8c8ULL, 0x6419c87d32d11919ULL, 0x39497270923b4949ULL, 0x43d9869aaf5fd9d9ULL, 0xeff2c31df931f2f2ULL, 0xabe34b48dba8e3e3ULL, 0x715be22ab6b95b5bULL, 0x1a8834920dbc8888ULL, 0x529aa4c8293e9a9aULL, 0x98262dbe4c0b2626ULL, 0xc8328dfa64bf3232ULL, 0xfab0e94a7d59b0b0ULL, 0x83e91b6acff2e9e9ULL, 0x3c0f78331e770f0fULL, 0x73d5e6a6b733d5d5ULL, 0x3a8074ba1df48080ULL, 0xc2be997c6127bebeULL, 0x13cd26de87ebcdcdULL, 0xd034bde468893434ULL, 0x3d487a7590324848ULL, 0xdbffab24e354ffffULL, 0xf57af78ff48d7a7aULL, 0x7a90f4ea3d649090ULL, 0x615fc23ebe9d5f5fULL, 0x80201da0403d2020ULL, 0xbd6867d5d00f6868ULL, 0x681ad07234ca1a1aULL, 0x82ae192c41b7aeaeULL, 0xeab4c95e757db4b4ULL, 0x4d549a19a8ce5454ULL, 0x7693ece53b7f9393ULL, 0x88220daa442f2222ULL, 0x8d6407e9c8636464ULL, 0xe3f1db12ff2af1f1ULL, 0xd173bfa2e6cc7373ULL, 0x4812905a24821212ULL, 0x1d403a5d807a4040ULL, 0x2008402810480808ULL, 0x2bc356e89b95c3c3ULL, 0x97ec337bc5dfececULL, 0x4bdb9690ab4ddbdbULL, 0xbea1611f5fc0a1a1ULL, 0x0e8d1c8307918d8dULL, 0xf43df5c97ac83d3dULL, 0x6697ccf1335b9797ULL, 0x0000000000000000ULL, 0x1bcf36d483f9cfcfULL, 0xac2b4587566e2b2bULL, 0xc57697b3ece17676ULL, 0x328264b019e68282ULL, 0x7fd6fea9b128d6d6ULL, 0x6c1bd87736c31b1bULL, 0xeeb5c15b7774b5b5ULL, 0x86af112943beafafULL, 0xb56a77dfd41d6a6aULL, 0x5d50ba0da0ea5050ULL, 0x0945124c8a574545ULL, 0xebf3cb18fb38f3f3ULL, 0xc0309df060ad3030ULL, 0x9bef2b74c3c4efefULL, 0xfc3fe5c37eda3f3fULL, 0x4955921caac75555ULL, 0xb2a2791059dba2a2ULL, 0x8fea0365c9e9eaeaULL, 0x89650fecca6a6565ULL, 0xd2bab9686903babaULL, 0xbc2f65935e4a2f2fULL, 
0x27c04ee79d8ec0c0ULL, 0x5fdebe81a160dedeULL, 0x701ce06c38fc1c1cULL, 0xd3fdbb2ee746fdfdULL, 0x294d52649a1f4d4dULL, 0x7292e4e039769292ULL, 0xc9758fbceafa7575ULL, 0x1806301e0c360606ULL, 0x128a249809ae8a8aULL, 0xf2b2f940794bb2b2ULL, 0xbfe66359d185e6e6ULL, 0x380e70361c7e0e0eULL, 0x7c1ff8633ee71f1fULL, 0x956237f7c4556262ULL, 0x77d4eea3b53ad4d4ULL, 0x9aa829324d81a8a8ULL, 0x6296c4f431529696ULL, 0xc3f99b3aef62f9f9ULL, 0x33c566f697a3c5c5ULL, 0x942535b14a102525ULL, 0x7959f220b2ab5959ULL, 0x2a8454ae15d08484ULL, 0xd572b7a7e4c57272ULL, 0xe439d5dd72ec3939ULL, 0x2d4c5a6198164c4cULL, 0x655eca3bbc945e5eULL, 0xfd78e785f09f7878ULL, 0xe038ddd870e53838ULL, 0x0a8c148605988c8cULL, 0x63d1c6b2bf17d1d1ULL, 0xaea5410b57e4a5a5ULL, 0xafe2434dd9a1e2e2ULL, 0x99612ff8c24e6161ULL, 0xf6b3f1457b42b3b3ULL, 0x842115a542342121ULL, 0x4a9c94d625089c9cULL, 0x781ef0663cee1e1eULL, 0x1143225286614343ULL, 0x3bc776fc93b1c7c7ULL, 0xd7fcb32be54ffcfcULL, 0x1004201408240404ULL, 0x5951b208a2e35151ULL, 0x5e99bcc72f259999ULL, 0xa96d4fc4da226d6dULL, 0x340d68391a650d0dULL, 0xcffa8335e979fafaULL, 0x5bdfb684a369dfdfULL, 0xe57ed79bfca97e7eULL, 0x90243db448192424ULL, 0xec3bc5d776fe3b3bULL, 0x96ab313d4b9aababULL, 0x1fce3ed181f0ceceULL, 0x4411885522991111ULL, 0x068f0c8903838f8fULL, 0x254e4a6b9c044e4eULL, 0xe6b7d1517366b7b7ULL, 0x8beb0b60cbe0ebebULL, 0xf03cfdcc78c13c3cULL, 0x3e817cbf1ffd8181ULL, 0x6a94d4fe35409494ULL, 0xfbf7eb0cf31cf7f7ULL, 0xdeb9a1676f18b9b9ULL, 0x4c13985f268b1313ULL, 0xb02c7d9c58512c2cULL, 0x6bd3d6b8bb05d3d3ULL, 0xbbe76b5cd38ce7e7ULL, 0xa56e57cbdc396e6eULL, 0x37c46ef395aac4c4ULL, 0x0c03180f061b0303ULL, 0x45568a13acdc5656ULL, 0x0d441a49885e4444ULL, 0xe17fdf9efea07f7fULL, 0x9ea921374f88a9a9ULL, 0xa82a4d8254672a2aULL, 0xd6bbb16d6b0abbbbULL, 0x23c146e29f87c1c1ULL, 0x5153a202a6f15353ULL, 0x57dcae8ba572dcdcULL, 0x2c0b582716530b0bULL, 0x4e9d9cd327019d9dULL, 0xad6c47c1d82b6c6cULL, 0xc43195f562a43131ULL, 0xcd7487b9e8f37474ULL, 0xfff6e309f115f6f6ULL, 0x05460a438c4c4646ULL, 0x8aac092645a5acacULL, 0x1e893c970fb58989ULL, 0x5014a04428b41414ULL, 0xa3e15b42dfbae1e1ULL, 0x5816b04e2ca61616ULL, 0xe83acdd274f73a3aULL, 0xb9696fd0d2066969ULL, 0x2409482d12410909ULL, 0xdd70a7ade0d77070ULL, 0xe2b6d954716fb6b6ULL, 0x67d0ceb7bd1ed0d0ULL, 0x93ed3b7ec7d6ededULL, 0x17cc2edb85e2ccccULL, 0x15422a5784684242ULL, 0x5a98b4c22d2c9898ULL, 0xaaa4490e55eda4a4ULL, 0xa0285d8850752828ULL, 0x6d5cda31b8865c5cULL, 0xc7f8933fed6bf8f8ULL, 0x228644a411c28686ULL, }; static const u64 C7[256] = { 0x186018c07830d818ULL, 0x238c2305af462623ULL, 0xc63fc67ef991b8c6ULL, 0xe887e8136fcdfbe8ULL, 0x8726874ca113cb87ULL, 0xb8dab8a9626d11b8ULL, 0x0104010805020901ULL, 0x4f214f426e9e0d4fULL, 0x36d836adee6c9b36ULL, 0xa6a2a6590451ffa6ULL, 0xd26fd2debdb90cd2ULL, 0xf5f3f5fb06f70ef5ULL, 0x79f979ef80f29679ULL, 0x6fa16f5fcede306fULL, 0x917e91fcef3f6d91ULL, 0x525552aa07a4f852ULL, 0x609d6027fdc04760ULL, 0xbccabc89766535bcULL, 0x9b569baccd2b379bULL, 0x8e028e048c018a8eULL, 0xa3b6a371155bd2a3ULL, 0x0c300c603c186c0cULL, 0x7bf17bff8af6847bULL, 0x35d435b5e16a8035ULL, 0x1d741de8693af51dULL, 0xe0a7e05347ddb3e0ULL, 0xd77bd7f6acb321d7ULL, 0xc22fc25eed999cc2ULL, 0x2eb82e6d965c432eULL, 0x4b314b627a96294bULL, 0xfedffea321e15dfeULL, 0x5741578216aed557ULL, 0x155415a8412abd15ULL, 0x77c1779fb6eee877ULL, 0x37dc37a5eb6e9237ULL, 0xe5b3e57b56d79ee5ULL, 0x9f469f8cd923139fULL, 0xf0e7f0d317fd23f0ULL, 0x4a354a6a7f94204aULL, 0xda4fda9e95a944daULL, 0x587d58fa25b0a258ULL, 0xc903c906ca8fcfc9ULL, 0x29a429558d527c29ULL, 0x0a280a5022145a0aULL, 0xb1feb1e14f7f50b1ULL, 0xa0baa0691a5dc9a0ULL, 0x6bb16b7fdad6146bULL, 0x852e855cab17d985ULL, 
0xbdcebd8173673cbdULL, 0x5d695dd234ba8f5dULL, 0x1040108050209010ULL, 0xf4f7f4f303f507f4ULL, 0xcb0bcb16c08bddcbULL, 0x3ef83eedc67cd33eULL, 0x05140528110a2d05ULL, 0x6781671fe6ce7867ULL, 0xe4b7e47353d597e4ULL, 0x279c2725bb4e0227ULL, 0x4119413258827341ULL, 0x8b168b2c9d0ba78bULL, 0xa7a6a7510153f6a7ULL, 0x7de97dcf94fab27dULL, 0x956e95dcfb374995ULL, 0xd847d88e9fad56d8ULL, 0xfbcbfb8b30eb70fbULL, 0xee9fee2371c1cdeeULL, 0x7ced7cc791f8bb7cULL, 0x66856617e3cc7166ULL, 0xdd53dda68ea77bddULL, 0x175c17b84b2eaf17ULL, 0x47014702468e4547ULL, 0x9e429e84dc211a9eULL, 0xca0fca1ec589d4caULL, 0x2db42d75995a582dULL, 0xbfc6bf9179632ebfULL, 0x071c07381b0e3f07ULL, 0xad8ead012347acadULL, 0x5a755aea2fb4b05aULL, 0x8336836cb51bef83ULL, 0x33cc3385ff66b633ULL, 0x6391633ff2c65c63ULL, 0x020802100a041202ULL, 0xaa92aa39384993aaULL, 0x71d971afa8e2de71ULL, 0xc807c80ecf8dc6c8ULL, 0x196419c87d32d119ULL, 0x4939497270923b49ULL, 0xd943d9869aaf5fd9ULL, 0xf2eff2c31df931f2ULL, 0xe3abe34b48dba8e3ULL, 0x5b715be22ab6b95bULL, 0x881a8834920dbc88ULL, 0x9a529aa4c8293e9aULL, 0x2698262dbe4c0b26ULL, 0x32c8328dfa64bf32ULL, 0xb0fab0e94a7d59b0ULL, 0xe983e91b6acff2e9ULL, 0x0f3c0f78331e770fULL, 0xd573d5e6a6b733d5ULL, 0x803a8074ba1df480ULL, 0xbec2be997c6127beULL, 0xcd13cd26de87ebcdULL, 0x34d034bde4688934ULL, 0x483d487a75903248ULL, 0xffdbffab24e354ffULL, 0x7af57af78ff48d7aULL, 0x907a90f4ea3d6490ULL, 0x5f615fc23ebe9d5fULL, 0x2080201da0403d20ULL, 0x68bd6867d5d00f68ULL, 0x1a681ad07234ca1aULL, 0xae82ae192c41b7aeULL, 0xb4eab4c95e757db4ULL, 0x544d549a19a8ce54ULL, 0x937693ece53b7f93ULL, 0x2288220daa442f22ULL, 0x648d6407e9c86364ULL, 0xf1e3f1db12ff2af1ULL, 0x73d173bfa2e6cc73ULL, 0x124812905a248212ULL, 0x401d403a5d807a40ULL, 0x0820084028104808ULL, 0xc32bc356e89b95c3ULL, 0xec97ec337bc5dfecULL, 0xdb4bdb9690ab4ddbULL, 0xa1bea1611f5fc0a1ULL, 0x8d0e8d1c8307918dULL, 0x3df43df5c97ac83dULL, 0x976697ccf1335b97ULL, 0x0000000000000000ULL, 0xcf1bcf36d483f9cfULL, 0x2bac2b4587566e2bULL, 0x76c57697b3ece176ULL, 0x82328264b019e682ULL, 0xd67fd6fea9b128d6ULL, 0x1b6c1bd87736c31bULL, 0xb5eeb5c15b7774b5ULL, 0xaf86af112943beafULL, 0x6ab56a77dfd41d6aULL, 0x505d50ba0da0ea50ULL, 0x450945124c8a5745ULL, 0xf3ebf3cb18fb38f3ULL, 0x30c0309df060ad30ULL, 0xef9bef2b74c3c4efULL, 0x3ffc3fe5c37eda3fULL, 0x554955921caac755ULL, 0xa2b2a2791059dba2ULL, 0xea8fea0365c9e9eaULL, 0x6589650fecca6a65ULL, 0xbad2bab9686903baULL, 0x2fbc2f65935e4a2fULL, 0xc027c04ee79d8ec0ULL, 0xde5fdebe81a160deULL, 0x1c701ce06c38fc1cULL, 0xfdd3fdbb2ee746fdULL, 0x4d294d52649a1f4dULL, 0x927292e4e0397692ULL, 0x75c9758fbceafa75ULL, 0x061806301e0c3606ULL, 0x8a128a249809ae8aULL, 0xb2f2b2f940794bb2ULL, 0xe6bfe66359d185e6ULL, 0x0e380e70361c7e0eULL, 0x1f7c1ff8633ee71fULL, 0x62956237f7c45562ULL, 0xd477d4eea3b53ad4ULL, 0xa89aa829324d81a8ULL, 0x966296c4f4315296ULL, 0xf9c3f99b3aef62f9ULL, 0xc533c566f697a3c5ULL, 0x25942535b14a1025ULL, 0x597959f220b2ab59ULL, 0x842a8454ae15d084ULL, 0x72d572b7a7e4c572ULL, 0x39e439d5dd72ec39ULL, 0x4c2d4c5a6198164cULL, 0x5e655eca3bbc945eULL, 0x78fd78e785f09f78ULL, 0x38e038ddd870e538ULL, 0x8c0a8c148605988cULL, 0xd163d1c6b2bf17d1ULL, 0xa5aea5410b57e4a5ULL, 0xe2afe2434dd9a1e2ULL, 0x6199612ff8c24e61ULL, 0xb3f6b3f1457b42b3ULL, 0x21842115a5423421ULL, 0x9c4a9c94d625089cULL, 0x1e781ef0663cee1eULL, 0x4311432252866143ULL, 0xc73bc776fc93b1c7ULL, 0xfcd7fcb32be54ffcULL, 0x0410042014082404ULL, 0x515951b208a2e351ULL, 0x995e99bcc72f2599ULL, 0x6da96d4fc4da226dULL, 0x0d340d68391a650dULL, 0xfacffa8335e979faULL, 0xdf5bdfb684a369dfULL, 0x7ee57ed79bfca97eULL, 0x2490243db4481924ULL, 0x3bec3bc5d776fe3bULL, 0xab96ab313d4b9aabULL, 
0xce1fce3ed181f0ceULL, 0x1144118855229911ULL, 0x8f068f0c8903838fULL, 0x4e254e4a6b9c044eULL, 0xb7e6b7d1517366b7ULL, 0xeb8beb0b60cbe0ebULL, 0x3cf03cfdcc78c13cULL, 0x813e817cbf1ffd81ULL, 0x946a94d4fe354094ULL, 0xf7fbf7eb0cf31cf7ULL, 0xb9deb9a1676f18b9ULL, 0x134c13985f268b13ULL, 0x2cb02c7d9c58512cULL, 0xd36bd3d6b8bb05d3ULL, 0xe7bbe76b5cd38ce7ULL, 0x6ea56e57cbdc396eULL, 0xc437c46ef395aac4ULL, 0x030c03180f061b03ULL, 0x5645568a13acdc56ULL, 0x440d441a49885e44ULL, 0x7fe17fdf9efea07fULL, 0xa99ea921374f88a9ULL, 0x2aa82a4d8254672aULL, 0xbbd6bbb16d6b0abbULL, 0xc123c146e29f87c1ULL, 0x535153a202a6f153ULL, 0xdc57dcae8ba572dcULL, 0x0b2c0b582716530bULL, 0x9d4e9d9cd327019dULL, 0x6cad6c47c1d82b6cULL, 0x31c43195f562a431ULL, 0x74cd7487b9e8f374ULL, 0xf6fff6e309f115f6ULL, 0x4605460a438c4c46ULL, 0xac8aac092645a5acULL, 0x891e893c970fb589ULL, 0x145014a04428b414ULL, 0xe1a3e15b42dfbae1ULL, 0x165816b04e2ca616ULL, 0x3ae83acdd274f73aULL, 0x69b9696fd0d20669ULL, 0x092409482d124109ULL, 0x70dd70a7ade0d770ULL, 0xb6e2b6d954716fb6ULL, 0xd067d0ceb7bd1ed0ULL, 0xed93ed3b7ec7d6edULL, 0xcc17cc2edb85e2ccULL, 0x4215422a57846842ULL, 0x985a98b4c22d2c98ULL, 0xa4aaa4490e55eda4ULL, 0x28a0285d88507528ULL, 0x5c6d5cda31b8865cULL, 0xf8c7f8933fed6bf8ULL, 0x86228644a411c286ULL, }; static const u64 rc[WHIRLPOOL_ROUNDS] = { 0x1823c6e887b8014fULL, 0x36a6d2f5796f9152ULL, 0x60bc9b8ea30c7b35ULL, 0x1de0d7c22e4bfe57ULL, 0x157737e59ff04adaULL, 0x58c9290ab1a06b85ULL, 0xbd5d10f4cb3e0567ULL, 0xe427418ba77d95d8ULL, 0xfbee7c66dd17479eULL, 0xca2dbf07ad5a8333ULL, }; /** * The core Whirlpool transform. */ static void wp512_process_buffer(struct wp512_ctx *wctx) { int i, r; u64 K[8]; /* the round key */ u64 block[8]; /* mu(buffer) */ u64 state[8]; /* the cipher state */ u64 L[8]; const __be64 *buffer = (const __be64 *)wctx->buffer; for (i = 0; i < 8; i++) block[i] = be64_to_cpu(buffer[i]); state[0] = block[0] ^ (K[0] = wctx->hash[0]); state[1] = block[1] ^ (K[1] = wctx->hash[1]); state[2] = block[2] ^ (K[2] = wctx->hash[2]); state[3] = block[3] ^ (K[3] = wctx->hash[3]); state[4] = block[4] ^ (K[4] = wctx->hash[4]); state[5] = block[5] ^ (K[5] = wctx->hash[5]); state[6] = block[6] ^ (K[6] = wctx->hash[6]); state[7] = block[7] ^ (K[7] = wctx->hash[7]); for (r = 0; r < WHIRLPOOL_ROUNDS; r++) { L[0] = C0[(int)(K[0] >> 56) ] ^ C1[(int)(K[7] >> 48) & 0xff] ^ C2[(int)(K[6] >> 40) & 0xff] ^ C3[(int)(K[5] >> 32) & 0xff] ^ C4[(int)(K[4] >> 24) & 0xff] ^ C5[(int)(K[3] >> 16) & 0xff] ^ C6[(int)(K[2] >> 8) & 0xff] ^ C7[(int)(K[1] ) & 0xff] ^ rc[r]; L[1] = C0[(int)(K[1] >> 56) ] ^ C1[(int)(K[0] >> 48) & 0xff] ^ C2[(int)(K[7] >> 40) & 0xff] ^ C3[(int)(K[6] >> 32) & 0xff] ^ C4[(int)(K[5] >> 24) & 0xff] ^ C5[(int)(K[4] >> 16) & 0xff] ^ C6[(int)(K[3] >> 8) & 0xff] ^ C7[(int)(K[2] ) & 0xff]; L[2] = C0[(int)(K[2] >> 56) ] ^ C1[(int)(K[1] >> 48) & 0xff] ^ C2[(int)(K[0] >> 40) & 0xff] ^ C3[(int)(K[7] >> 32) & 0xff] ^ C4[(int)(K[6] >> 24) & 0xff] ^ C5[(int)(K[5] >> 16) & 0xff] ^ C6[(int)(K[4] >> 8) & 0xff] ^ C7[(int)(K[3] ) & 0xff]; L[3] = C0[(int)(K[3] >> 56) ] ^ C1[(int)(K[2] >> 48) & 0xff] ^ C2[(int)(K[1] >> 40) & 0xff] ^ C3[(int)(K[0] >> 32) & 0xff] ^ C4[(int)(K[7] >> 24) & 0xff] ^ C5[(int)(K[6] >> 16) & 0xff] ^ C6[(int)(K[5] >> 8) & 0xff] ^ C7[(int)(K[4] ) & 0xff]; L[4] = C0[(int)(K[4] >> 56) ] ^ C1[(int)(K[3] >> 48) & 0xff] ^ C2[(int)(K[2] >> 40) & 0xff] ^ C3[(int)(K[1] >> 32) & 0xff] ^ C4[(int)(K[0] >> 24) & 0xff] ^ C5[(int)(K[7] >> 16) & 0xff] ^ C6[(int)(K[6] >> 8) & 0xff] ^ C7[(int)(K[5] ) & 0xff]; L[5] = C0[(int)(K[5] >> 56) ] ^ C1[(int)(K[4] >> 48) & 0xff] ^ C2[(int)(K[3] >> 
40) & 0xff] ^ C3[(int)(K[2] >> 32) & 0xff] ^ C4[(int)(K[1] >> 24) & 0xff] ^ C5[(int)(K[0] >> 16) & 0xff] ^ C6[(int)(K[7] >> 8) & 0xff] ^ C7[(int)(K[6] ) & 0xff]; L[6] = C0[(int)(K[6] >> 56) ] ^ C1[(int)(K[5] >> 48) & 0xff] ^ C2[(int)(K[4] >> 40) & 0xff] ^ C3[(int)(K[3] >> 32) & 0xff] ^ C4[(int)(K[2] >> 24) & 0xff] ^ C5[(int)(K[1] >> 16) & 0xff] ^ C6[(int)(K[0] >> 8) & 0xff] ^ C7[(int)(K[7] ) & 0xff]; L[7] = C0[(int)(K[7] >> 56) ] ^ C1[(int)(K[6] >> 48) & 0xff] ^ C2[(int)(K[5] >> 40) & 0xff] ^ C3[(int)(K[4] >> 32) & 0xff] ^ C4[(int)(K[3] >> 24) & 0xff] ^ C5[(int)(K[2] >> 16) & 0xff] ^ C6[(int)(K[1] >> 8) & 0xff] ^ C7[(int)(K[0] ) & 0xff]; K[0] = L[0]; K[1] = L[1]; K[2] = L[2]; K[3] = L[3]; K[4] = L[4]; K[5] = L[5]; K[6] = L[6]; K[7] = L[7]; L[0] = C0[(int)(state[0] >> 56) ] ^ C1[(int)(state[7] >> 48) & 0xff] ^ C2[(int)(state[6] >> 40) & 0xff] ^ C3[(int)(state[5] >> 32) & 0xff] ^ C4[(int)(state[4] >> 24) & 0xff] ^ C5[(int)(state[3] >> 16) & 0xff] ^ C6[(int)(state[2] >> 8) & 0xff] ^ C7[(int)(state[1] ) & 0xff] ^ K[0]; L[1] = C0[(int)(state[1] >> 56) ] ^ C1[(int)(state[0] >> 48) & 0xff] ^ C2[(int)(state[7] >> 40) & 0xff] ^ C3[(int)(state[6] >> 32) & 0xff] ^ C4[(int)(state[5] >> 24) & 0xff] ^ C5[(int)(state[4] >> 16) & 0xff] ^ C6[(int)(state[3] >> 8) & 0xff] ^ C7[(int)(state[2] ) & 0xff] ^ K[1]; L[2] = C0[(int)(state[2] >> 56) ] ^ C1[(int)(state[1] >> 48) & 0xff] ^ C2[(int)(state[0] >> 40) & 0xff] ^ C3[(int)(state[7] >> 32) & 0xff] ^ C4[(int)(state[6] >> 24) & 0xff] ^ C5[(int)(state[5] >> 16) & 0xff] ^ C6[(int)(state[4] >> 8) & 0xff] ^ C7[(int)(state[3] ) & 0xff] ^ K[2]; L[3] = C0[(int)(state[3] >> 56) ] ^ C1[(int)(state[2] >> 48) & 0xff] ^ C2[(int)(state[1] >> 40) & 0xff] ^ C3[(int)(state[0] >> 32) & 0xff] ^ C4[(int)(state[7] >> 24) & 0xff] ^ C5[(int)(state[6] >> 16) & 0xff] ^ C6[(int)(state[5] >> 8) & 0xff] ^ C7[(int)(state[4] ) & 0xff] ^ K[3]; L[4] = C0[(int)(state[4] >> 56) ] ^ C1[(int)(state[3] >> 48) & 0xff] ^ C2[(int)(state[2] >> 40) & 0xff] ^ C3[(int)(state[1] >> 32) & 0xff] ^ C4[(int)(state[0] >> 24) & 0xff] ^ C5[(int)(state[7] >> 16) & 0xff] ^ C6[(int)(state[6] >> 8) & 0xff] ^ C7[(int)(state[5] ) & 0xff] ^ K[4]; L[5] = C0[(int)(state[5] >> 56) ] ^ C1[(int)(state[4] >> 48) & 0xff] ^ C2[(int)(state[3] >> 40) & 0xff] ^ C3[(int)(state[2] >> 32) & 0xff] ^ C4[(int)(state[1] >> 24) & 0xff] ^ C5[(int)(state[0] >> 16) & 0xff] ^ C6[(int)(state[7] >> 8) & 0xff] ^ C7[(int)(state[6] ) & 0xff] ^ K[5]; L[6] = C0[(int)(state[6] >> 56) ] ^ C1[(int)(state[5] >> 48) & 0xff] ^ C2[(int)(state[4] >> 40) & 0xff] ^ C3[(int)(state[3] >> 32) & 0xff] ^ C4[(int)(state[2] >> 24) & 0xff] ^ C5[(int)(state[1] >> 16) & 0xff] ^ C6[(int)(state[0] >> 8) & 0xff] ^ C7[(int)(state[7] ) & 0xff] ^ K[6]; L[7] = C0[(int)(state[7] >> 56) ] ^ C1[(int)(state[6] >> 48) & 0xff] ^ C2[(int)(state[5] >> 40) & 0xff] ^ C3[(int)(state[4] >> 32) & 0xff] ^ C4[(int)(state[3] >> 24) & 0xff] ^ C5[(int)(state[2] >> 16) & 0xff] ^ C6[(int)(state[1] >> 8) & 0xff] ^ C7[(int)(state[0] ) & 0xff] ^ K[7]; state[0] = L[0]; state[1] = L[1]; state[2] = L[2]; state[3] = L[3]; state[4] = L[4]; state[5] = L[5]; state[6] = L[6]; state[7] = L[7]; } /* * apply the Miyaguchi-Preneel compression function: */ wctx->hash[0] ^= state[0] ^ block[0]; wctx->hash[1] ^= state[1] ^ block[1]; wctx->hash[2] ^= state[2] ^ block[2]; wctx->hash[3] ^= state[3] ^ block[3]; wctx->hash[4] ^= state[4] ^ block[4]; wctx->hash[5] ^= state[5] ^ block[5]; wctx->hash[6] ^= state[6] ^ block[6]; wctx->hash[7] ^= state[7] ^ block[7]; } static int wp512_init(struct shash_desc *desc) { 
struct wp512_ctx *wctx = shash_desc_ctx(desc); int i; memset(wctx->bitLength, 0, 32); wctx->bufferBits = wctx->bufferPos = 0; wctx->buffer[0] = 0; for (i = 0; i < 8; i++) { wctx->hash[i] = 0L; } return 0; } static int wp512_update(struct shash_desc *desc, const u8 *source, unsigned int len) { struct wp512_ctx *wctx = shash_desc_ctx(desc); int sourcePos = 0; unsigned int bits_len = len * 8; // convert to number of bits int sourceGap = (8 - ((int)bits_len & 7)) & 7; int bufferRem = wctx->bufferBits & 7; int i; u32 b, carry; u8 *buffer = wctx->buffer; u8 *bitLength = wctx->bitLength; int bufferBits = wctx->bufferBits; int bufferPos = wctx->bufferPos; u64 value = bits_len; for (i = 31, carry = 0; i >= 0 && (carry != 0 || value != 0ULL); i--) { carry += bitLength[i] + ((u32)value & 0xff); bitLength[i] = (u8)carry; carry >>= 8; value >>= 8; } while (bits_len > 8) { b = ((source[sourcePos] << sourceGap) & 0xff) | ((source[sourcePos + 1] & 0xff) >> (8 - sourceGap)); buffer[bufferPos++] |= (u8)(b >> bufferRem); bufferBits += 8 - bufferRem; if (bufferBits == WP512_BLOCK_SIZE * 8) { wp512_process_buffer(wctx); bufferBits = bufferPos = 0; } buffer[bufferPos] = b << (8 - bufferRem); bufferBits += bufferRem; bits_len -= 8; sourcePos++; } if (bits_len > 0) { b = (source[sourcePos] << sourceGap) & 0xff; buffer[bufferPos] |= b >> bufferRem; } else { b = 0; } if (bufferRem + bits_len < 8) { bufferBits += bits_len; } else { bufferPos++; bufferBits += 8 - bufferRem; bits_len -= 8 - bufferRem; if (bufferBits == WP512_BLOCK_SIZE * 8) { wp512_process_buffer(wctx); bufferBits = bufferPos = 0; } buffer[bufferPos] = b << (8 - bufferRem); bufferBits += (int)bits_len; } wctx->bufferBits = bufferBits; wctx->bufferPos = bufferPos; return 0; } static int wp512_final(struct shash_desc *desc, u8 *out) { struct wp512_ctx *wctx = shash_desc_ctx(desc); int i; u8 *buffer = wctx->buffer; u8 *bitLength = wctx->bitLength; int bufferBits = wctx->bufferBits; int bufferPos = wctx->bufferPos; __be64 *digest = (__be64 *)out; buffer[bufferPos] |= 0x80U >> (bufferBits & 7); bufferPos++; if (bufferPos > WP512_BLOCK_SIZE - WP512_LENGTHBYTES) { if (bufferPos < WP512_BLOCK_SIZE) { memset(&buffer[bufferPos], 0, WP512_BLOCK_SIZE - bufferPos); } wp512_process_buffer(wctx); bufferPos = 0; } if (bufferPos < WP512_BLOCK_SIZE - WP512_LENGTHBYTES) { memset(&buffer[bufferPos], 0, (WP512_BLOCK_SIZE - WP512_LENGTHBYTES) - bufferPos); } bufferPos = WP512_BLOCK_SIZE - WP512_LENGTHBYTES; memcpy(&buffer[WP512_BLOCK_SIZE - WP512_LENGTHBYTES], bitLength, WP512_LENGTHBYTES); wp512_process_buffer(wctx); for (i = 0; i < WP512_DIGEST_SIZE/8; i++) digest[i] = cpu_to_be64(wctx->hash[i]); wctx->bufferBits = bufferBits; wctx->bufferPos = bufferPos; return 0; } static int wp384_final(struct shash_desc *desc, u8 *out) { u8 D[64]; wp512_final(desc, D); memcpy(out, D, WP384_DIGEST_SIZE); memzero_explicit(D, WP512_DIGEST_SIZE); return 0; } static int wp256_final(struct shash_desc *desc, u8 *out) { u8 D[64]; wp512_final(desc, D); memcpy(out, D, WP256_DIGEST_SIZE); memzero_explicit(D, WP512_DIGEST_SIZE); return 0; } static struct shash_alg wp_algs[3] = { { .digestsize = WP512_DIGEST_SIZE, .init = wp512_init, .update = wp512_update, .final = wp512_final, .descsize = sizeof(struct wp512_ctx), .base = { .cra_name = "wp512", .cra_blocksize = WP512_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = WP384_DIGEST_SIZE, .init = wp512_init, .update = wp512_update, .final = wp384_final, .descsize = sizeof(struct wp512_ctx), .base = { .cra_name = "wp384", .cra_blocksize = 
WP512_BLOCK_SIZE,
		.cra_module	=	THIS_MODULE,
	}
}, {
	.digestsize	=	WP256_DIGEST_SIZE,
	.init		=	wp512_init,
	.update		=	wp512_update,
	.final		=	wp256_final,
	.descsize	=	sizeof(struct wp512_ctx),
	.base		=	{
		.cra_name	=	"wp256",
		.cra_blocksize	=	WP512_BLOCK_SIZE,
		.cra_module	=	THIS_MODULE,
	}
} };

static int __init wp512_mod_init(void)
{
	return crypto_register_shashes(wp_algs, ARRAY_SIZE(wp_algs));
}

static void __exit wp512_mod_fini(void)
{
	crypto_unregister_shashes(wp_algs, ARRAY_SIZE(wp_algs));
}

MODULE_ALIAS_CRYPTO("wp512");
MODULE_ALIAS_CRYPTO("wp384");
MODULE_ALIAS_CRYPTO("wp256");

module_init(wp512_mod_init);
module_exit(wp512_mod_fini);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Whirlpool Message Digest Algorithm");
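/*
 * Illustrative sketch (not part of the original file): how another piece of
 * kernel code could consume the "wp512" shash registered by the module above.
 * The function name example_wp512_digest and its caller-supplied buffers are
 * hypothetical; only the crypto_alloc_shash()/crypto_shash_digest() calls are
 * the standard shash API.
 */
#include <crypto/hash.h>
#include <linux/err.h>

static int example_wp512_digest(const u8 *data, unsigned int len,
				u8 out[WP512_DIGEST_SIZE])
{
	struct crypto_shash *tfm;
	int err;

	/* Look up the "wp512" algorithm registered in wp512_mod_init(). */
	tfm = crypto_alloc_shash("wp512", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		/* Stack-allocated descriptor sized for this transform. */
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		/* init + update + final in a single call. */
		err = crypto_shash_digest(desc, data, len, out);
		shash_desc_zero(desc);
	}

	crypto_free_shash(tfm);
	return err;
}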
5 5 5 5 2 2 2 2 1 1 2 2 2 2 59 59 12 56 55 10 10 10 1 47 48 59 2 2 39 2 39 5 28 11 1 11 1 11 8 8 8 8 2 2 2 2 5 4 4 1 3 2 2 3 2 6 1 1 1 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 
/* BlueZ - Bluetooth protocol stack for Linux Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved. Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* Bluetooth HCI connection handling. */ #include <linux/export.h> #include <linux/debugfs.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> #include "hci_request.h" #include "smp.h" #include "a2mp.h" struct sco_param { u16 pkt_type; u16 max_latency; u8 retrans_effort; }; static const struct sco_param esco_param_cvsd[] = { { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000a, 0x01 }, /* S3 */ { EDR_ESCO_MASK & ~ESCO_2EV3, 0x0007, 0x01 }, /* S2 */ { EDR_ESCO_MASK | ESCO_EV3, 0x0007, 0x01 }, /* S1 */ { EDR_ESCO_MASK | ESCO_HV3, 0xffff, 0x01 }, /* D1 */ { EDR_ESCO_MASK | ESCO_HV1, 0xffff, 0x01 }, /* D0 */ }; static const struct sco_param sco_param_cvsd[] = { { EDR_ESCO_MASK | ESCO_HV3, 0xffff, 0xff }, /* D1 */ { EDR_ESCO_MASK | ESCO_HV1, 0xffff, 0xff }, /* D0 */ }; static const struct sco_param esco_param_msbc[] = { { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000d, 0x02 }, /* T2 */ { EDR_ESCO_MASK | ESCO_EV3, 0x0008, 0x02 }, /* T1 */ }; /* This function requires the caller holds hdev->lock */ static void hci_connect_le_scan_cleanup(struct hci_conn *conn) { struct hci_conn_params *params; struct hci_dev *hdev = conn->hdev; struct smp_irk *irk; bdaddr_t *bdaddr; u8 bdaddr_type; bdaddr = &conn->dst; bdaddr_type = conn->dst_type; /* Check if we need to convert to identity address */ irk = hci_get_irk(hdev, bdaddr, bdaddr_type); if (irk) { bdaddr = &irk->bdaddr; bdaddr_type = irk->addr_type; } params = hci_pend_le_action_lookup(&hdev->pend_le_conns, bdaddr, bdaddr_type); if (!params || !params->explicit_connect) return; /* The connection attempt was doing scan for new RPA, and is * in scan phase. If params are not associated with any other * autoconnect action, remove them completely.
If they are, just unmark * them as waiting for connection, by clearing explicit_connect field. */ params->explicit_connect = false; list_del_init(&params->action); switch (params->auto_connect) { case HCI_AUTO_CONN_EXPLICIT: hci_conn_params_del(hdev, bdaddr, bdaddr_type); /* return instead of break to avoid duplicate scan update */ return; case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: list_add(&params->action, &hdev->pend_le_conns); break; case HCI_AUTO_CONN_REPORT: list_add(&params->action, &hdev->pend_le_reports); break; default: break; } hci_update_background_scan(hdev); } static void hci_conn_cleanup(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; if (test_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags)) hci_conn_params_del(conn->hdev, &conn->dst, conn->dst_type); hci_chan_list_flush(conn); hci_conn_hash_del(hdev, conn); if (hdev->notify) hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); hci_conn_del_sysfs(conn); debugfs_remove_recursive(conn->debugfs); hci_dev_put(hdev); hci_conn_put(conn); } static void le_scan_cleanup(struct work_struct *work) { struct hci_conn *conn = container_of(work, struct hci_conn, le_scan_cleanup); struct hci_dev *hdev = conn->hdev; struct hci_conn *c = NULL; BT_DBG("%s hcon %p", hdev->name, conn); hci_dev_lock(hdev); /* Check that the hci_conn is still around */ rcu_read_lock(); list_for_each_entry_rcu(c, &hdev->conn_hash.list, list) { if (c == conn) break; } rcu_read_unlock(); if (c == conn) { hci_connect_le_scan_cleanup(conn); hci_conn_cleanup(conn); } hci_dev_unlock(hdev); hci_dev_put(hdev); hci_conn_put(conn); } static void hci_connect_le_scan_remove(struct hci_conn *conn) { BT_DBG("%s hcon %p", conn->hdev->name, conn); /* We can't call hci_conn_del/hci_conn_cleanup here since that * could deadlock with another hci_conn_del() call that's holding * hci_dev_lock and doing cancel_delayed_work_sync(&conn->disc_work). * Instead, grab temporary extra references to the hci_dev and * hci_conn and perform the necessary cleanup in a separate work * callback. */ hci_dev_hold(conn->hdev); hci_conn_get(conn); /* Even though we hold a reference to the hdev, many other * things might get cleaned up meanwhile, including the hdev's * own workqueue, so we can't use that for scheduling. */ schedule_work(&conn->le_scan_cleanup); } static void hci_acl_create_connection(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; struct inquiry_entry *ie; struct hci_cp_create_conn cp; BT_DBG("hcon %p", conn); conn->state = BT_CONNECT; conn->out = true; conn->role = HCI_ROLE_MASTER; conn->attempt++; conn->link_policy = hdev->link_policy; memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, &conn->dst); cp.pscan_rep_mode = 0x02; ie = hci_inquiry_cache_lookup(hdev, &conn->dst); if (ie) { if (inquiry_entry_age(ie) <= INQUIRY_ENTRY_AGE_MAX) { cp.pscan_rep_mode = ie->data.pscan_rep_mode; cp.pscan_mode = ie->data.pscan_mode; cp.clock_offset = ie->data.clock_offset | cpu_to_le16(0x8000); } memcpy(conn->dev_class, ie->data.dev_class, 3); if (ie->data.ssp_mode > 0) set_bit(HCI_CONN_SSP_ENABLED, &conn->flags); } cp.pkt_type = cpu_to_le16(conn->pkt_type); if (lmp_rswitch_capable(hdev) && !(hdev->link_mode & HCI_LM_MASTER)) cp.role_switch = 0x01; else cp.role_switch = 0x00; hci_send_cmd(hdev, HCI_OP_CREATE_CONN, sizeof(cp), &cp); } int hci_disconnect(struct hci_conn *conn, __u8 reason) { BT_DBG("hcon %p", conn); /* When we are master of an established connection and it enters * the disconnect timeout, then go ahead and try to read the * current clock offset.
Processing of the result is done * within the event handling and hci_clock_offset_evt function. */ if (conn->type == ACL_LINK && conn->role == HCI_ROLE_MASTER && (conn->state == BT_CONNECTED || conn->state == BT_CONFIG)) { struct hci_dev *hdev = conn->hdev; struct hci_cp_read_clock_offset clkoff_cp; clkoff_cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(hdev, HCI_OP_READ_CLOCK_OFFSET, sizeof(clkoff_cp), &clkoff_cp); } return hci_abort_conn(conn, reason); } static void hci_add_sco(struct hci_conn *conn, __u16 handle) { struct hci_dev *hdev = conn->hdev; struct hci_cp_add_sco cp; BT_DBG("hcon %p", conn); conn->state = BT_CONNECT; conn->out = true; conn->attempt++; cp.handle = cpu_to_le16(handle); cp.pkt_type = cpu_to_le16(conn->pkt_type); hci_send_cmd(hdev, HCI_OP_ADD_SCO, sizeof(cp), &cp); } bool hci_setup_sync(struct hci_conn *conn, __u16 handle) { struct hci_dev *hdev = conn->hdev; struct hci_cp_setup_sync_conn cp; const struct sco_param *param; BT_DBG("hcon %p", conn); conn->state = BT_CONNECT; conn->out = true; conn->attempt++; cp.handle = cpu_to_le16(handle); cp.tx_bandwidth = cpu_to_le32(0x00001f40); cp.rx_bandwidth = cpu_to_le32(0x00001f40); cp.voice_setting = cpu_to_le16(conn->setting); switch (conn->setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_TRANSP: if (conn->attempt > ARRAY_SIZE(esco_param_msbc)) return false; param = &esco_param_msbc[conn->attempt - 1]; break; case SCO_AIRMODE_CVSD: if (lmp_esco_capable(conn->link)) { if (conn->attempt > ARRAY_SIZE(esco_param_cvsd)) return false; param = &esco_param_cvsd[conn->attempt - 1]; } else { if (conn->attempt > ARRAY_SIZE(sco_param_cvsd)) return false; param = &sco_param_cvsd[conn->attempt - 1]; } break; default: return false; } cp.retrans_effort = param->retrans_effort; cp.pkt_type = __cpu_to_le16(param->pkt_type); cp.max_latency = __cpu_to_le16(param->max_latency); if (hci_send_cmd(hdev, HCI_OP_SETUP_SYNC_CONN, sizeof(cp), &cp) < 0) return false; return true; } u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, u16 to_multiplier) { struct hci_dev *hdev = conn->hdev; struct hci_conn_params *params; struct hci_cp_le_conn_update cp; hci_dev_lock(hdev); params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); if (params) { params->conn_min_interval = min; params->conn_max_interval = max; params->conn_latency = latency; params->supervision_timeout = to_multiplier; } hci_dev_unlock(hdev); memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); cp.conn_interval_min = cpu_to_le16(min); cp.conn_interval_max = cpu_to_le16(max); cp.conn_latency = cpu_to_le16(latency); cp.supervision_timeout = cpu_to_le16(to_multiplier); cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = cpu_to_le16(0x0000); hci_send_cmd(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp); if (params) return 0x01; return 0x00; } void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, __u8 ltk[16], __u8 key_size) { struct hci_dev *hdev = conn->hdev; struct hci_cp_le_start_enc cp; BT_DBG("hcon %p", conn); memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); cp.rand = rand; cp.ediv = ediv; memcpy(cp.ltk, ltk, key_size); hci_send_cmd(hdev, HCI_OP_LE_START_ENC, sizeof(cp), &cp); } /* Device _must_ be locked */ void hci_sco_setup(struct hci_conn *conn, __u8 status) { struct hci_conn *sco = conn->link; if (!sco) return; BT_DBG("hcon %p", conn); if (!status) { if (lmp_esco_capable(conn->hdev)) hci_setup_sync(sco, conn->handle); else hci_add_sco(sco, conn->handle); } else { hci_connect_cfm(sco, status); 
hci_conn_del(sco); } } static void hci_conn_timeout(struct work_struct *work) { struct hci_conn *conn = container_of(work, struct hci_conn, disc_work.work); int refcnt = atomic_read(&conn->refcnt); BT_DBG("hcon %p state %s", conn, state_to_string(conn->state)); WARN_ON(refcnt < 0); /* FIXME: It was observed that in a failed pairing scenario, refcnt * drops below 0. Probably this is because l2cap_conn_del calls * l2cap_chan_del for each channel, and inside l2cap_chan_del conn is * dropped. After that loop hci_chan_del is called which also drops * conn. For now make sure that ACL is alive if refcnt is higher than 0, * otherwise drop it. */ if (refcnt > 0) return; /* LE connections in scanning state need special handling */ if (conn->state == BT_CONNECT && conn->type == LE_LINK && test_bit(HCI_CONN_SCANNING, &conn->flags)) { hci_connect_le_scan_remove(conn); return; } hci_abort_conn(conn, hci_proto_disconn_ind(conn)); } /* Enter sniff mode */ static void hci_conn_idle(struct work_struct *work) { struct hci_conn *conn = container_of(work, struct hci_conn, idle_work.work); struct hci_dev *hdev = conn->hdev; BT_DBG("hcon %p mode %d", conn, conn->mode); if (!lmp_sniff_capable(hdev) || !lmp_sniff_capable(conn)) return; if (conn->mode != HCI_CM_ACTIVE || !(conn->link_policy & HCI_LP_SNIFF)) return; if (lmp_sniffsubr_capable(hdev) && lmp_sniffsubr_capable(conn)) { struct hci_cp_sniff_subrate cp; cp.handle = cpu_to_le16(conn->handle); cp.max_latency = cpu_to_le16(0); cp.min_remote_timeout = cpu_to_le16(0); cp.min_local_timeout = cpu_to_le16(0); hci_send_cmd(hdev, HCI_OP_SNIFF_SUBRATE, sizeof(cp), &cp); } if (!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->flags)) { struct hci_cp_sniff_mode cp; cp.handle = cpu_to_le16(conn->handle); cp.max_interval = cpu_to_le16(hdev->sniff_max_interval); cp.min_interval = cpu_to_le16(hdev->sniff_min_interval); cp.attempt = cpu_to_le16(4); cp.timeout = cpu_to_le16(1); hci_send_cmd(hdev, HCI_OP_SNIFF_MODE, sizeof(cp), &cp); } } static void hci_conn_auto_accept(struct work_struct *work) { struct hci_conn *conn = container_of(work, struct hci_conn, auto_accept_work.work); hci_send_cmd(conn->hdev, HCI_OP_USER_CONFIRM_REPLY, sizeof(conn->dst), &conn->dst); } static void le_conn_timeout(struct work_struct *work) { struct hci_conn *conn = container_of(work, struct hci_conn, le_conn_timeout.work); struct hci_dev *hdev = conn->hdev; BT_DBG(""); /* We could end up here due to having done directed advertising, * so clean up the state if necessary. This should however only * happen with broken hardware or if low duty cycle was used * (which doesn't have a timeout of its own).
*/ if (conn->role == HCI_ROLE_SLAVE) { u8 enable = 0x00; hci_send_cmd(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); hci_le_conn_failed(conn, HCI_ERROR_ADVERTISING_TIMEOUT); return; } hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM); } struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, u8 role) { struct hci_conn *conn; BT_DBG("%s dst %pMR", hdev->name, dst); conn = kzalloc(sizeof(*conn), GFP_KERNEL); if (!conn) return NULL; bacpy(&conn->dst, dst); bacpy(&conn->src, &hdev->bdaddr); conn->hdev = hdev; conn->type = type; conn->role = role; conn->mode = HCI_CM_ACTIVE; conn->state = BT_OPEN; conn->auth_type = HCI_AT_GENERAL_BONDING; conn->io_capability = hdev->io_capability; conn->remote_auth = 0xff; conn->key_type = 0xff; conn->rssi = HCI_RSSI_INVALID; conn->tx_power = HCI_TX_POWER_INVALID; conn->max_tx_power = HCI_TX_POWER_INVALID; set_bit(HCI_CONN_POWER_SAVE, &conn->flags); conn->disc_timeout = HCI_DISCONN_TIMEOUT; if (conn->role == HCI_ROLE_MASTER) conn->out = true; switch (type) { case ACL_LINK: conn->pkt_type = hdev->pkt_type & ACL_PTYPE_MASK; break; case LE_LINK: /* conn->src should reflect the local identity address */ hci_copy_identity_address(hdev, &conn->src, &conn->src_type); break; case SCO_LINK: if (lmp_esco_capable(hdev)) conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) | (hdev->esco_type & EDR_ESCO_MASK); else conn->pkt_type = hdev->pkt_type & SCO_PTYPE_MASK; break; case ESCO_LINK: conn->pkt_type = hdev->esco_type & ~EDR_ESCO_MASK; break; } skb_queue_head_init(&conn->data_q); INIT_LIST_HEAD(&conn->chan_list); INIT_DELAYED_WORK(&conn->disc_work, hci_conn_timeout); INIT_DELAYED_WORK(&conn->auto_accept_work, hci_conn_auto_accept); INIT_DELAYED_WORK(&conn->idle_work, hci_conn_idle); INIT_DELAYED_WORK(&conn->le_conn_timeout, le_conn_timeout); INIT_WORK(&conn->le_scan_cleanup, le_scan_cleanup); atomic_set(&conn->refcnt, 0); hci_dev_hold(hdev); hci_conn_hash_add(hdev, conn); if (hdev->notify) hdev->notify(hdev, HCI_NOTIFY_CONN_ADD); hci_conn_init_sysfs(conn); return conn; } int hci_conn_del(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; BT_DBG("%s hcon %p handle %d", hdev->name, conn, conn->handle); cancel_delayed_work_sync(&conn->disc_work); cancel_delayed_work_sync(&conn->auto_accept_work); cancel_delayed_work_sync(&conn->idle_work); if (conn->type == ACL_LINK) { struct hci_conn *sco = conn->link; if (sco) sco->link = NULL; /* Unacked frames */ hdev->acl_cnt += conn->sent; } else if (conn->type == LE_LINK) { cancel_delayed_work(&conn->le_conn_timeout); if (hdev->le_pkts) hdev->le_cnt += conn->sent; else hdev->acl_cnt += conn->sent; } else { struct hci_conn *acl = conn->link; if (acl) { acl->link = NULL; hci_conn_drop(acl); } } if (conn->amp_mgr) amp_mgr_put(conn->amp_mgr); skb_queue_purge(&conn->data_q); /* Remove the connection from the list and cleanup its remaining * state. This is a separate function since for some cases like * BT_CONNECT_SCAN we *only* want the cleanup part without the * rest of hci_conn_del. 
*/ hci_conn_cleanup(conn); return 0; } struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type) { int use_src = bacmp(src, BDADDR_ANY); struct hci_dev *hdev = NULL, *d; BT_DBG("%pMR -> %pMR", src, dst); read_lock(&hci_dev_list_lock); list_for_each_entry(d, &hci_dev_list, list) { if (!test_bit(HCI_UP, &d->flags) || hci_dev_test_flag(d, HCI_USER_CHANNEL) || d->dev_type != HCI_PRIMARY) continue; /* Simple routing: * No source address - find interface with bdaddr != dst * Source address - find interface with bdaddr == src */ if (use_src) { bdaddr_t id_addr; u8 id_addr_type; if (src_type == BDADDR_BREDR) { if (!lmp_bredr_capable(d)) continue; bacpy(&id_addr, &d->bdaddr); id_addr_type = BDADDR_BREDR; } else { if (!lmp_le_capable(d)) continue; hci_copy_identity_address(d, &id_addr, &id_addr_type); /* Convert from HCI to three-value type */ if (id_addr_type == ADDR_LE_DEV_PUBLIC) id_addr_type = BDADDR_LE_PUBLIC; else id_addr_type = BDADDR_LE_RANDOM; } if (!bacmp(&id_addr, src) && id_addr_type == src_type) { hdev = d; break; } } else { if (bacmp(&d->bdaddr, dst)) { hdev = d; break; } } } if (hdev) hdev = hci_dev_hold(hdev); read_unlock(&hci_dev_list_lock); return hdev; } EXPORT_SYMBOL(hci_get_route); /* This function requires the caller holds hdev->lock */ void hci_le_conn_failed(struct hci_conn *conn, u8 status) { struct hci_dev *hdev = conn->hdev; struct hci_conn_params *params; params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst, conn->dst_type); if (params && params->conn) { hci_conn_drop(params->conn); hci_conn_put(params->conn); params->conn = NULL; } conn->state = BT_CLOSED; /* If the status indicates successful cancellation of * the attempt (i.e. Unknown Connection Id) there's no point in * notifying failure since we'll go back to keep trying to * connect. The only exception is explicit connect requests * where a timeout + cancel does indicate an actual failure. */ if (status != HCI_ERROR_UNKNOWN_CONN_ID || (params && params->explicit_connect)) mgmt_connect_failed(hdev, &conn->dst, conn->type, conn->dst_type, status); hci_connect_cfm(conn, status); hci_conn_del(conn); /* Since we may have temporarily stopped the background scanning in * favor of connection establishment, we should restart it. */ hci_update_background_scan(hdev); /* Re-enable advertising in case this was a failed connection * attempt as a peripheral. */ hci_req_reenable_advertising(hdev); } static void create_le_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode) { struct hci_conn *conn; hci_dev_lock(hdev); conn = hci_lookup_le_connect(hdev); if (!status) { hci_connect_le_scan_cleanup(conn); goto done; } bt_dev_err(hdev, "request failed to create LE connection: " "status 0x%2.2x", status); if (!conn) goto done; hci_le_conn_failed(conn, status); done: hci_dev_unlock(hdev); } static bool conn_use_rpa(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; return hci_dev_test_flag(hdev, HCI_PRIVACY); } static void set_ext_conn_params(struct hci_conn *conn, struct hci_cp_le_ext_conn_param *p) { struct hci_dev *hdev = conn->hdev; memset(p, 0, sizeof(*p)); /* Set window to be the same value as the interval to * enable continuous scanning.
*/ p->scan_interval = cpu_to_le16(hdev->le_scan_interval); p->scan_window = p->scan_interval; p->conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); p->conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); p->conn_latency = cpu_to_le16(conn->le_conn_latency); p->supervision_timeout = cpu_to_le16(conn->le_supv_timeout); p->min_ce_len = cpu_to_le16(0x0000); p->max_ce_len = cpu_to_le16(0x0000); } static void hci_req_add_le_create_conn(struct hci_request *req, struct hci_conn *conn, bdaddr_t *direct_rpa) { struct hci_dev *hdev = conn->hdev; u8 own_addr_type; /* If direct address was provided we use it instead of current * address. */ if (direct_rpa) { if (bacmp(&req->hdev->random_addr, direct_rpa)) hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, direct_rpa); /* direct address is always RPA */ own_addr_type = ADDR_LE_DEV_RANDOM; } else { /* Update random address, but set require_privacy to false so * that we never connect with an non-resolvable address. */ if (hci_update_random_address(req, false, conn_use_rpa(conn), &own_addr_type)) return; } if (use_ext_conn(hdev)) { struct hci_cp_le_ext_create_conn *cp; struct hci_cp_le_ext_conn_param *p; u8 data[sizeof(*cp) + sizeof(*p) * 3]; u32 plen; cp = (void *) data; p = (void *) cp->data; memset(cp, 0, sizeof(*cp)); bacpy(&cp->peer_addr, &conn->dst); cp->peer_addr_type = conn->dst_type; cp->own_addr_type = own_addr_type; plen = sizeof(*cp); if (scan_1m(hdev)) { cp->phys |= LE_SCAN_PHY_1M; set_ext_conn_params(conn, p); p++; plen += sizeof(*p); } if (scan_2m(hdev)) { cp->phys |= LE_SCAN_PHY_2M; set_ext_conn_params(conn, p); p++; plen += sizeof(*p); } if (scan_coded(hdev)) { cp->phys |= LE_SCAN_PHY_CODED; set_ext_conn_params(conn, p); plen += sizeof(*p); } hci_req_add(req, HCI_OP_LE_EXT_CREATE_CONN, plen, data); } else { struct hci_cp_le_create_conn cp; memset(&cp, 0, sizeof(cp)); /* Set window to be the same value as the interval to enable * continuous scanning. */ cp.scan_interval = cpu_to_le16(hdev->le_scan_interval); cp.scan_window = cp.scan_interval; bacpy(&cp.peer_addr, &conn->dst); cp.peer_addr_type = conn->dst_type; cp.own_address_type = own_addr_type; cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval); cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval); cp.conn_latency = cpu_to_le16(conn->le_conn_latency); cp.supervision_timeout = cpu_to_le16(conn->le_supv_timeout); cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = cpu_to_le16(0x0000); hci_req_add(req, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp); } conn->state = BT_CONNECT; clear_bit(HCI_CONN_SCANNING, &conn->flags); } static void hci_req_directed_advertising(struct hci_request *req, struct hci_conn *conn) { struct hci_dev *hdev = req->hdev; u8 own_addr_type; u8 enable; if (ext_adv_capable(hdev)) { struct hci_cp_le_set_ext_adv_params cp; bdaddr_t random_addr; /* Set require_privacy to false so that the remote device has a * chance of identifying us. 
*/ if (hci_get_random_address(hdev, false, conn_use_rpa(conn), NULL, &own_addr_type, &random_addr) < 0) return; memset(&cp, 0, sizeof(cp)); cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_DIRECT_IND); cp.own_addr_type = own_addr_type; cp.channel_map = hdev->le_adv_channel_map; cp.tx_power = HCI_TX_POWER_INVALID; cp.primary_phy = HCI_ADV_PHY_1M; cp.secondary_phy = HCI_ADV_PHY_1M; cp.handle = 0; /* Use instance 0 for directed adv */ cp.own_addr_type = own_addr_type; cp.peer_addr_type = conn->dst_type; bacpy(&cp.peer_addr, &conn->dst); hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp); if (own_addr_type == ADDR_LE_DEV_RANDOM && bacmp(&random_addr, BDADDR_ANY) && bacmp(&random_addr, &hdev->random_addr)) { struct hci_cp_le_set_adv_set_rand_addr cp; memset(&cp, 0, sizeof(cp)); cp.handle = 0; bacpy(&cp.bdaddr, &random_addr); hci_req_add(req, HCI_OP_LE_SET_ADV_SET_RAND_ADDR, sizeof(cp), &cp); } __hci_req_enable_ext_advertising(req); } else { struct hci_cp_le_set_adv_param cp; /* Clear the HCI_LE_ADV bit temporarily so that the * hci_update_random_address knows that it's safe to go ahead * and write a new random address. The flag will be set back on * as soon as the SET_ADV_ENABLE HCI command completes. */ hci_dev_clear_flag(hdev, HCI_LE_ADV); /* Set require_privacy to false so that the remote device has a * chance of identifying us. */ if (hci_update_random_address(req, false, conn_use_rpa(conn), &own_addr_type) < 0) return; memset(&cp, 0, sizeof(cp)); /* Some controllers might reject command if intervals are not * within range for undirected advertising. * BCM20702A0 is known to be affected by this. */ cp.min_interval = cpu_to_le16(0x0020); cp.max_interval = cpu_to_le16(0x0020); cp.type = LE_ADV_DIRECT_IND; cp.own_address_type = own_addr_type; cp.direct_addr_type = conn->dst_type; bacpy(&cp.direct_addr, &conn->dst); cp.channel_map = hdev->le_adv_channel_map; hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp); enable = 0x01; hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); } conn->state = BT_CONNECT; } struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, u8 sec_level, u16 conn_timeout, u8 role, bdaddr_t *direct_rpa) { struct hci_conn_params *params; struct hci_conn *conn; struct smp_irk *irk; struct hci_request req; int err; /* Let's make sure that le is enabled.*/ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { if (lmp_le_capable(hdev)) return ERR_PTR(-ECONNREFUSED); return ERR_PTR(-EOPNOTSUPP); } /* Since the controller supports only one LE connection attempt at a * time, we return -EBUSY if there is any connection attempt running. */ if (hci_lookup_le_connect(hdev)) return ERR_PTR(-EBUSY); /* If there's already a connection object but it's not in * scanning state it means it must already be established, in * which case we can't do anything else except report a failure * to connect. */ conn = hci_conn_hash_lookup_le(hdev, dst, dst_type); if (conn && !test_bit(HCI_CONN_SCANNING, &conn->flags)) { return ERR_PTR(-EBUSY); } /* When given an identity address with existing identity * resolving key, the connection needs to be established * to a resolvable random address. * * Storing the resolvable random address is required here * to handle connection failures. The address will later * be resolved back into the original identity address * from the connect request. 
*/ irk = hci_find_irk_by_addr(hdev, dst, dst_type); if (irk && bacmp(&irk->rpa, BDADDR_ANY)) { dst = &irk->rpa; dst_type = ADDR_LE_DEV_RANDOM; } if (conn) { bacpy(&conn->dst, dst); } else { conn = hci_conn_add(hdev, LE_LINK, dst, role); if (!conn) return ERR_PTR(-ENOMEM); hci_conn_hold(conn); conn->pending_sec_level = sec_level; } conn->dst_type = dst_type; conn->sec_level = BT_SECURITY_LOW; conn->conn_timeout = conn_timeout; hci_req_init(&req, hdev); /* Disable advertising if we're active. For master role * connections most controllers will refuse to connect if * advertising is enabled, and for slave role connections we * anyway have to disable it in order to start directed * advertising. */ if (hci_dev_test_flag(hdev, HCI_LE_ADV)) { u8 enable = 0x00; hci_req_add(&req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); } /* If requested to connect as slave use directed advertising */ if (conn->role == HCI_ROLE_SLAVE) { /* If we're active scanning most controllers are unable * to initiate advertising. Simply reject the attempt. */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN) && hdev->le_scan_type == LE_SCAN_ACTIVE) { hci_req_purge(&req); hci_conn_del(conn); return ERR_PTR(-EBUSY); } hci_req_directed_advertising(&req, conn); goto create_conn; } params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); if (params) { conn->le_conn_min_interval = params->conn_min_interval; conn->le_conn_max_interval = params->conn_max_interval; conn->le_conn_latency = params->conn_latency; conn->le_supv_timeout = params->supervision_timeout; } else { conn->le_conn_min_interval = hdev->le_conn_min_interval; conn->le_conn_max_interval = hdev->le_conn_max_interval; conn->le_conn_latency = hdev->le_conn_latency; conn->le_supv_timeout = hdev->le_supv_timeout; } /* If controller is scanning, we stop it since some controllers are * not able to scan and connect at the same time. Also set the * HCI_LE_SCAN_INTERRUPTED flag so that the command complete * handler for scan disabling knows to set the correct discovery * state. */ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { hci_req_add_le_scan_disable(&req); hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED); } hci_req_add_le_create_conn(&req, conn, direct_rpa); create_conn: err = hci_req_run(&req, create_le_conn_complete); if (err) { hci_conn_del(conn); return ERR_PTR(err); } return conn; } static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type) { struct hci_conn *conn; conn = hci_conn_hash_lookup_le(hdev, addr, type); if (!conn) return false; if (conn->state != BT_CONNECTED) return false; return true; } /* This function requires the caller holds hdev->lock */ static int hci_explicit_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; if (is_connected(hdev, addr, addr_type)) return -EISCONN; params = hci_conn_params_lookup(hdev, addr, addr_type); if (!params) { params = hci_conn_params_add(hdev, addr, addr_type); if (!params) return -ENOMEM; /* If we created new params, mark them to be deleted in * hci_connect_le_scan_cleanup. It's different case than * existing disabled params, those will stay after cleanup. 
*/ params->auto_connect = HCI_AUTO_CONN_EXPLICIT; } /* We're trying to connect, so make sure params are at pend_le_conns */ if (params->auto_connect == HCI_AUTO_CONN_DISABLED || params->auto_connect == HCI_AUTO_CONN_REPORT || params->auto_connect == HCI_AUTO_CONN_EXPLICIT) { list_del_init(&params->action); list_add(&params->action, &hdev->pend_le_conns); } params->explicit_connect = true; BT_DBG("addr %pMR (type %u) auto_connect %u", addr, addr_type, params->auto_connect); return 0; } /* This function requires the caller holds hdev->lock */ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, u8 sec_level, u16 conn_timeout) { struct hci_conn *conn; /* Let's make sure that le is enabled. */ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { if (lmp_le_capable(hdev)) return ERR_PTR(-ECONNREFUSED); return ERR_PTR(-EOPNOTSUPP); } /* Some devices send ATT messages as soon as the physical link is * established. To be able to handle these ATT messages, the user- * space first establishes the connection and then starts the pairing * process. * * So if a hci_conn object already exists for the following connection * attempt, we simply update pending_sec_level and auth_type fields * and return the object found. */ conn = hci_conn_hash_lookup_le(hdev, dst, dst_type); if (conn) { if (conn->pending_sec_level < sec_level) conn->pending_sec_level = sec_level; goto done; } BT_DBG("requesting refresh of dst_addr"); conn = hci_conn_add(hdev, LE_LINK, dst, HCI_ROLE_MASTER); if (!conn) return ERR_PTR(-ENOMEM); if (hci_explicit_conn_params_set(hdev, dst, dst_type) < 0) { hci_conn_del(conn); return ERR_PTR(-EBUSY); } conn->state = BT_CONNECT; set_bit(HCI_CONN_SCANNING, &conn->flags); conn->dst_type = dst_type; conn->sec_level = BT_SECURITY_LOW; conn->pending_sec_level = sec_level; conn->conn_timeout = conn_timeout; hci_update_background_scan(hdev); done: hci_conn_hold(conn); return conn; } struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, u8 sec_level, u8 auth_type) { struct hci_conn *acl; if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { if (lmp_bredr_capable(hdev)) return ERR_PTR(-ECONNREFUSED); return ERR_PTR(-EOPNOTSUPP); } acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst); if (!acl) { acl = hci_conn_add(hdev, ACL_LINK, dst, HCI_ROLE_MASTER); if (!acl) return ERR_PTR(-ENOMEM); } hci_conn_hold(acl); if (acl->state == BT_OPEN || acl->state == BT_CLOSED) { acl->sec_level = BT_SECURITY_LOW; acl->pending_sec_level = sec_level; acl->auth_type = auth_type; hci_acl_create_connection(acl); } return acl; } struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, __u16 setting) { struct hci_conn *acl; struct hci_conn *sco; acl = hci_connect_acl(hdev, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING); if (IS_ERR(acl)) return acl; sco = hci_conn_hash_lookup_ba(hdev, type, dst); if (!sco) { sco = hci_conn_add(hdev, type, dst, HCI_ROLE_MASTER); if (!sco) { hci_conn_drop(acl); return ERR_PTR(-ENOMEM); } } acl->link = sco; sco->link = acl; hci_conn_hold(sco); sco->setting = setting; if (acl->state == BT_CONNECTED && (sco->state == BT_OPEN || sco->state == BT_CLOSED)) { set_bit(HCI_CONN_POWER_SAVE, &acl->flags); hci_conn_enter_active_mode(acl, BT_POWER_FORCE_ACTIVE_ON); if (test_bit(HCI_CONN_MODE_CHANGE_PEND, &acl->flags)) { /* defer SCO setup until mode change completed */ set_bit(HCI_CONN_SCO_SETUP_PEND, &acl->flags); return sco; } hci_sco_setup(acl, 0x00); } return sco; } /* Check link security requirement */ int hci_conn_check_link_mode(struct hci_conn *conn) {
BT_DBG("hcon %p", conn); /* In Secure Connections Only mode, it is required that Secure * Connections is used and the link is encrypted with AES-CCM * using a P-256 authenticated combination key. */ if (hci_dev_test_flag(conn->hdev, HCI_SC_ONLY)) { if (!hci_conn_sc_enabled(conn) || !test_bit(HCI_CONN_AES_CCM, &conn->flags) || conn->key_type != HCI_LK_AUTH_COMBINATION_P256) return 0; } /* AES encryption is required for Level 4: * * BLUETOOTH CORE SPECIFICATION Version 5.2 | Vol 3, Part C * page 1319: * * 128-bit equivalent strength for link and encryption keys * required using FIPS approved algorithms (E0 not allowed, * SAFER+ not allowed, and P-192 not allowed; encryption key * not shortened) */ if (conn->sec_level == BT_SECURITY_FIPS && !test_bit(HCI_CONN_AES_CCM, &conn->flags)) { bt_dev_err(conn->hdev, "Invalid security: Missing AES-CCM usage"); return 0; } if (hci_conn_ssp_enabled(conn) && !test_bit(HCI_CONN_ENCRYPT, &conn->flags)) return 0; return 1; } /* Authenticate remote device */ static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) { BT_DBG("hcon %p", conn); if (conn->pending_sec_level > sec_level) sec_level = conn->pending_sec_level; if (sec_level > conn->sec_level) conn->pending_sec_level = sec_level; else if (test_bit(HCI_CONN_AUTH, &conn->flags)) return 1; /* Make sure we preserve an existing MITM requirement*/ auth_type |= (conn->auth_type & 0x01); conn->auth_type = auth_type; if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->flags)) { struct hci_cp_auth_requested cp; cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED, sizeof(cp), &cp); /* If we're already encrypted set the REAUTH_PEND flag, * otherwise set the ENCRYPT_PEND. */ if (test_bit(HCI_CONN_ENCRYPT, &conn->flags)) set_bit(HCI_CONN_REAUTH_PEND, &conn->flags); else set_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); } return 0; } /* Encrypt the the link */ static void hci_conn_encrypt(struct hci_conn *conn) { BT_DBG("hcon %p", conn); if (!test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) { struct hci_cp_set_conn_encrypt cp; cp.handle = cpu_to_le16(conn->handle); cp.encrypt = 0x01; hci_send_cmd(conn->hdev, HCI_OP_SET_CONN_ENCRYPT, sizeof(cp), &cp); } } /* Enable security */ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type, bool initiator) { BT_DBG("hcon %p", conn); if (conn->type == LE_LINK) return smp_conn_security(conn, sec_level); /* For sdp we don't need the link key. */ if (sec_level == BT_SECURITY_SDP) return 1; /* For non 2.1 devices and low security level we don't need the link key. */ if (sec_level == BT_SECURITY_LOW && !hci_conn_ssp_enabled(conn)) return 1; /* For other security levels we need the link key. */ if (!test_bit(HCI_CONN_AUTH, &conn->flags)) goto auth; /* An authenticated FIPS approved combination key has sufficient * security for security level 4. */ if (conn->key_type == HCI_LK_AUTH_COMBINATION_P256 && sec_level == BT_SECURITY_FIPS) goto encrypt; /* An authenticated combination key has sufficient security for security level 3. */ if ((conn->key_type == HCI_LK_AUTH_COMBINATION_P192 || conn->key_type == HCI_LK_AUTH_COMBINATION_P256) && sec_level == BT_SECURITY_HIGH) goto encrypt; /* An unauthenticated combination key has sufficient security for security level 1 and 2. 
*/ if ((conn->key_type == HCI_LK_UNAUTH_COMBINATION_P192 || conn->key_type == HCI_LK_UNAUTH_COMBINATION_P256) && (sec_level == BT_SECURITY_MEDIUM || sec_level == BT_SECURITY_LOW)) goto encrypt; /* A combination key has always sufficient security for the security levels 1 or 2. High security level requires the combination key is generated using maximum PIN code length (16). For pre 2.1 units. */ if (conn->key_type == HCI_LK_COMBINATION && (sec_level == BT_SECURITY_MEDIUM || sec_level == BT_SECURITY_LOW || conn->pin_length == 16)) goto encrypt; auth: if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) return 0; if (initiator) set_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags); if (!hci_conn_auth(conn, sec_level, auth_type)) return 0; encrypt: if (test_bit(HCI_CONN_ENCRYPT, &conn->flags)) { /* Ensure that the encryption key size has been read, * otherwise stall the upper layer responses. */ if (!conn->enc_key_size) return 0; /* Nothing else needed, all requirements are met */ return 1; } hci_conn_encrypt(conn); return 0; } EXPORT_SYMBOL(hci_conn_security); /* Check secure link requirement */ int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level) { BT_DBG("hcon %p", conn); /* Accept if non-secure or higher security level is required */ if (sec_level != BT_SECURITY_HIGH && sec_level != BT_SECURITY_FIPS) return 1; /* Accept if secure or higher security level is already present */ if (conn->sec_level == BT_SECURITY_HIGH || conn->sec_level == BT_SECURITY_FIPS) return 1; /* Reject not secure link */ return 0; } EXPORT_SYMBOL(hci_conn_check_secure); /* Switch role */ int hci_conn_switch_role(struct hci_conn *conn, __u8 role) { BT_DBG("hcon %p", conn); if (role == conn->role) return 1; if (!test_and_set_bit(HCI_CONN_RSWITCH_PEND, &conn->flags)) { struct hci_cp_switch_role cp; bacpy(&cp.bdaddr, &conn->dst); cp.role = role; hci_send_cmd(conn->hdev, HCI_OP_SWITCH_ROLE, sizeof(cp), &cp); } return 0; } EXPORT_SYMBOL(hci_conn_switch_role); /* Enter active mode */ void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active) { struct hci_dev *hdev = conn->hdev; BT_DBG("hcon %p mode %d", conn, conn->mode); if (conn->mode != HCI_CM_SNIFF) goto timer; if (!test_bit(HCI_CONN_POWER_SAVE, &conn->flags) && !force_active) goto timer; if (!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->flags)) { struct hci_cp_exit_sniff_mode cp; cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(hdev, HCI_OP_EXIT_SNIFF_MODE, sizeof(cp), &cp); } timer: if (hdev->idle_timeout > 0) queue_delayed_work(hdev->workqueue, &conn->idle_work, msecs_to_jiffies(hdev->idle_timeout)); } /* Drop all connection on the device */ void hci_conn_hash_flush(struct hci_dev *hdev) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c, *n; BT_DBG("hdev %s", hdev->name); list_for_each_entry_safe(c, n, &h->list, list) { c->state = BT_CLOSED; hci_disconn_cfm(c, HCI_ERROR_LOCAL_HOST_TERM); hci_conn_del(c); } } /* Check pending connect attempts */ void hci_conn_check_pending(struct hci_dev *hdev) { struct hci_conn *conn; BT_DBG("hdev %s", hdev->name); hci_dev_lock(hdev); conn = hci_conn_hash_lookup_state(hdev, ACL_LINK, BT_CONNECT2); if (conn) hci_acl_create_connection(conn); hci_dev_unlock(hdev); } static u32 get_link_mode(struct hci_conn *conn) { u32 link_mode = 0; if (conn->role == HCI_ROLE_MASTER) link_mode |= HCI_LM_MASTER; if (test_bit(HCI_CONN_ENCRYPT, &conn->flags)) link_mode |= HCI_LM_ENCRYPT; if (test_bit(HCI_CONN_AUTH, &conn->flags)) link_mode |= HCI_LM_AUTH; if (test_bit(HCI_CONN_SECURE, &conn->flags)) link_mode |= 
HCI_LM_SECURE; if (test_bit(HCI_CONN_FIPS, &conn->flags)) link_mode |= HCI_LM_FIPS; return link_mode; } int hci_get_conn_list(void __user *arg) { struct hci_conn *c; struct hci_conn_list_req req, *cl; struct hci_conn_info *ci; struct hci_dev *hdev; int n = 0, size, err; if (copy_from_user(&req, arg, sizeof(req))) return -EFAULT; if (!req.conn_num || req.conn_num > (PAGE_SIZE * 2) / sizeof(*ci)) return -EINVAL; size = sizeof(req) + req.conn_num * sizeof(*ci); cl = kmalloc(size, GFP_KERNEL); if (!cl) return -ENOMEM; hdev = hci_dev_get(req.dev_id); if (!hdev) { kfree(cl); return -ENODEV; } ci = cl->conn_info; hci_dev_lock(hdev); list_for_each_entry(c, &hdev->conn_hash.list, list) { bacpy(&(ci + n)->bdaddr, &c->dst); (ci + n)->handle = c->handle; (ci + n)->type = c->type; (ci + n)->out = c->out; (ci + n)->state = c->state; (ci + n)->link_mode = get_link_mode(c); if (++n >= req.conn_num) break; } hci_dev_unlock(hdev); cl->dev_id = hdev->id; cl->conn_num = n; size = sizeof(req) + n * sizeof(*ci); hci_dev_put(hdev); err = copy_to_user(arg, cl, size); kfree(cl); return err ? -EFAULT : 0; } int hci_get_conn_info(struct hci_dev *hdev, void __user *arg) { struct hci_conn_info_req req; struct hci_conn_info ci; struct hci_conn *conn; char __user *ptr = arg + sizeof(req); if (copy_from_user(&req, arg, sizeof(req))) return -EFAULT; hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, req.type, &req.bdaddr); if (conn) { bacpy(&ci.bdaddr, &conn->dst); ci.handle = conn->handle; ci.type = conn->type; ci.out = conn->out; ci.state = conn->state; ci.link_mode = get_link_mode(conn); } hci_dev_unlock(hdev); if (!conn) return -ENOENT; return copy_to_user(ptr, &ci, sizeof(ci)) ? -EFAULT : 0; } int hci_get_auth_info(struct hci_dev *hdev, void __user *arg) { struct hci_auth_info_req req; struct hci_conn *conn; if (copy_from_user(&req, arg, sizeof(req))) return -EFAULT; hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &req.bdaddr); if (conn) req.type = conn->auth_type; hci_dev_unlock(hdev); if (!conn) return -ENOENT; return copy_to_user(arg, &req, sizeof(req)) ? 
-EFAULT : 0; } struct hci_chan *hci_chan_create(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; struct hci_chan *chan; BT_DBG("%s hcon %p", hdev->name, conn); if (test_bit(HCI_CONN_DROP, &conn->flags)) { BT_DBG("Refusing to create new hci_chan"); return NULL; } chan = kzalloc(sizeof(*chan), GFP_KERNEL); if (!chan) return NULL; chan->conn = hci_conn_get(conn); skb_queue_head_init(&chan->data_q); chan->state = BT_CONNECTED; list_add_rcu(&chan->list, &conn->chan_list); return chan; } void hci_chan_del(struct hci_chan *chan) { struct hci_conn *conn = chan->conn; struct hci_dev *hdev = conn->hdev; BT_DBG("%s hcon %p chan %p", hdev->name, conn, chan); list_del_rcu(&chan->list); synchronize_rcu(); /* Prevent new hci_chans from being created for this hci_conn */ set_bit(HCI_CONN_DROP, &conn->flags); hci_conn_put(conn); skb_queue_purge(&chan->data_q); kfree(chan); } void hci_chan_list_flush(struct hci_conn *conn) { struct hci_chan *chan, *n; BT_DBG("hcon %p", conn); list_for_each_entry_safe(chan, n, &conn->chan_list, list) hci_chan_del(chan); } static struct hci_chan *__hci_chan_lookup_handle(struct hci_conn *hcon, __u16 handle) { struct hci_chan *hchan; list_for_each_entry(hchan, &hcon->chan_list, list) { if (hchan->handle == handle) return hchan; } return NULL; } struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *hcon; struct hci_chan *hchan = NULL; rcu_read_lock(); list_for_each_entry_rcu(hcon, &h->list, list) { hchan = __hci_chan_lookup_handle(hcon, handle); if (hchan) break; } rcu_read_unlock(); return hchan; }
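/*
 * Illustrative sketch, not part of hci_conn.c above: the deferred-teardown
 * idiom used by hci_connect_le_scan_remove() and le_scan_cleanup(), reduced
 * to a hypothetical "foo" object. When tearing an object down synchronously
 * could deadlock on locks the caller already holds, take an extra reference
 * and push the real cleanup into a work item. All foo_* names are invented
 * for illustration only.
 */
#include <linux/kernel.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct foo {
	refcount_t refs;
	struct work_struct cleanup_work;
	/* ... state protected by locks the caller may already hold ... */
};

static void foo_put(struct foo *f)
{
	if (refcount_dec_and_test(&f->refs))
		kfree(f);
}

static void foo_cleanup_work(struct work_struct *work)
{
	struct foo *f = container_of(work, struct foo, cleanup_work);

	/* Runs later in process context with none of the caller's locks
	 * held, so the heavier teardown (taking locks, cancelling delayed
	 * work synchronously, etc.) is safe here, mirroring what
	 * le_scan_cleanup() does under hci_dev_lock().
	 */

	foo_put(f);	/* drop the reference taken in foo_remove() */
}

static void foo_init(struct foo *f)
{
	refcount_set(&f->refs, 1);
	INIT_WORK(&f->cleanup_work, foo_cleanup_work);
}

static void foo_remove(struct foo *f)
{
	refcount_inc(&f->refs);		/* keep @f alive until the work runs */
	schedule_work(&f->cleanup_work);
}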
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2011 STRATO AG * written by Arne Jansen <sensille@gmx.net> */ #include <linux/slab.h> #include "ulist.h" #include "ctree.h" /* * ulist is a generic data structure to hold a collection of unique u64 * values. The only operations it supports are adding to the list and * enumerating it. * It is possible to store an auxiliary value along with the key. * * A sample usage for ulists is the enumeration of directed graphs without * visiting a node twice. The pseudo-code could look like this: * * ulist = ulist_alloc(); * ulist_add(ulist, root); * ULIST_ITER_INIT(&uiter); * * while ((elem = ulist_next(ulist, &uiter))) { * for (all child nodes n in elem) * ulist_add(ulist, n); * do something useful with the node; * } * ulist_free(ulist); * * This assumes the graph nodes are addressable by u64. This stems from the * usage for tree enumeration in btrfs, where the logical addresses are * 64 bit. * * It is also useful for tree enumeration which could be done elegantly * recursively, but is not possible due to kernel stack limitations. The * loop would be similar to the above. */ /** * ulist_init - freshly initialize a ulist * @ulist: the ulist to initialize * * Note: don't use this function to init an already used ulist, use * ulist_reinit instead. */ void ulist_init(struct ulist *ulist) { INIT_LIST_HEAD(&ulist->nodes); ulist->root = RB_ROOT; ulist->nnodes = 0; } /** * ulist_release - free up additionally allocated memory for the ulist * @ulist: the ulist from which to free the additional memory * * This is useful in cases where the base 'struct ulist' has been statically * allocated. */ void ulist_release(struct ulist *ulist) { struct ulist_node *node; struct ulist_node *next; list_for_each_entry_safe(node, next, &ulist->nodes, list) { kfree(node); } ulist->root = RB_ROOT; INIT_LIST_HEAD(&ulist->nodes); } /** * ulist_reinit - prepare a ulist for reuse * @ulist: ulist to be reused * * Free up all additional memory allocated for the list elements and reinit * the ulist. */ void ulist_reinit(struct ulist *ulist) { ulist_release(ulist); ulist_init(ulist); } /** * ulist_alloc - dynamically allocate a ulist * @gfp_mask: allocation flags to use for the base allocation * * The allocated ulist will be returned in an initialized state.
*/ struct ulist *ulist_alloc(gfp_t gfp_mask) { struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask); if (!ulist) return NULL; ulist_init(ulist); return ulist; } /** * ulist_free - free dynamically allocated ulist * @ulist: ulist to free * * It is not necessary to call ulist_release before. */ void ulist_free(struct ulist *ulist) { if (!ulist) return; ulist_release(ulist); kfree(ulist); } static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) { struct rb_node *n = ulist->root.rb_node; struct ulist_node *u = NULL; while (n) { u = rb_entry(n, struct ulist_node, rb_node); if (u->val < val) n = n->rb_right; else if (u->val > val) n = n->rb_left; else return u; } return NULL; } static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node) { rb_erase(&node->rb_node, &ulist->root); list_del(&node->list); kfree(node); BUG_ON(ulist->nnodes == 0); ulist->nnodes--; } static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) { struct rb_node **p = &ulist->root.rb_node; struct rb_node *parent = NULL; struct ulist_node *cur = NULL; while (*p) { parent = *p; cur = rb_entry(parent, struct ulist_node, rb_node); if (cur->val < ins->val) p = &(*p)->rb_right; else if (cur->val > ins->val) p = &(*p)->rb_left; else return -EEXIST; } rb_link_node(&ins->rb_node, parent, p); rb_insert_color(&ins->rb_node, &ulist->root); return 0; } /** * ulist_add - add an element to the ulist * @ulist: ulist to add the element to * @val: value to add to ulist * @aux: auxiliary value to store along with val * @gfp_mask: flags to use for allocation * * Note: locking must be provided by the caller. In case of rwlocks write * locking is needed * * Add an element to a ulist. The @val will only be added if it doesn't * already exist. If it is added, the auxiliary value @aux is stored along with * it. In case @val already exists in the ulist, @aux is ignored, even if * it differs from the already stored value. * * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been * inserted. * In case of allocation failure -ENOMEM is returned and the ulist stays * unaltered. */ int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask) { return ulist_add_merge(ulist, val, aux, NULL, gfp_mask); } int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, u64 *old_aux, gfp_t gfp_mask) { int ret; struct ulist_node *node; node = ulist_rbtree_search(ulist, val); if (node) { if (old_aux) *old_aux = node->aux; return 0; } node = kmalloc(sizeof(*node), gfp_mask); if (!node) return -ENOMEM; node->val = val; node->aux = aux; ret = ulist_rbtree_insert(ulist, node); ASSERT(!ret); list_add_tail(&node->list, &ulist->nodes); ulist->nnodes++; return 1; } /* * ulist_del - delete one node from ulist * @ulist: ulist to remove node from * @val: value to delete * @aux: aux to delete * * The deletion will only be done when *BOTH* val and aux matches. * Return 0 for successful delete. * Return > 0 for not found. */ int ulist_del(struct ulist *ulist, u64 val, u64 aux) { struct ulist_node *node; node = ulist_rbtree_search(ulist, val); /* Not found */ if (!node) return 1; if (node->aux != aux) return 1; /* Found and delete */ ulist_rbtree_erase(ulist, node); return 0; } /** * ulist_next - iterate ulist * @ulist: ulist to iterate * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator) * * Note: locking must be provided by the caller. In case of rwlocks only read * locking is needed * * This function is used to iterate an ulist. 
* It returns the next element from the ulist or %NULL when the * end is reached. No guarantee is made with respect to the order in which * the elements are returned. They might neither be returned in order of * addition nor in ascending order. * It is allowed to call ulist_add during an enumeration. Newly added items * are guaranteed to show up in the running enumeration. */ struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) { struct ulist_node *node; if (list_empty(&ulist->nodes)) return NULL; if (uiter->cur_list && uiter->cur_list->next == &ulist->nodes) return NULL; if (uiter->cur_list) { uiter->cur_list = uiter->cur_list->next; } else { uiter->cur_list = ulist->nodes.next; } node = list_entry(uiter->cur_list, struct ulist_node, list); return node; }
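/*
 * Illustrative sketch, not part of ulist.c above: it turns the pseudo-code
 * from the header comment at the top of this file into concrete use of the
 * ulist API (ulist_alloc/ulist_add/ULIST_ITER_INIT/ulist_next/ulist_free).
 * enqueue_children() is a hypothetical helper standing in for "add every
 * child node of @val"; everything else is the interface defined here and in
 * ulist.h.
 */
#include "ulist.h"

/* Hypothetical: adds each child of @val to @seen via ulist_add(). */
int enqueue_children(struct ulist *seen, u64 val, gfp_t gfp_mask);

static int walk_graph_once(u64 root, gfp_t gfp_mask)
{
	struct ulist *seen;
	struct ulist_iterator uiter;
	struct ulist_node *elem;
	int ret;

	seen = ulist_alloc(gfp_mask);
	if (!seen)
		return -ENOMEM;

	ret = ulist_add(seen, root, 0, gfp_mask);
	if (ret < 0)
		goto out;

	ULIST_ITER_INIT(&uiter);
	while ((elem = ulist_next(seen, &uiter))) {
		/* Duplicates are rejected by ulist_add(), and values added
		 * during the enumeration are visited later in this same
		 * loop, so each reachable node is handled exactly once.
		 */
		ret = enqueue_children(seen, elem->val, gfp_mask);
		if (ret < 0)
			goto out;
	}
	ret = 0;
out:
	ulist_free(seen);
	return ret;
}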
/* * net/tipc/msg.h: Include file for TIPC message header routines * * Copyright (c) 2000-2007, 2014-2017 Ericsson AB * Copyright (c) 2005-2008, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
*/ #ifndef _TIPC_MSG_H #define _TIPC_MSG_H #include <linux/tipc.h> #include "core.h" /* * Constants and routines used to read and write TIPC payload message headers * * Note: Some items are also used with TIPC internal message headers */ #define TIPC_VERSION 2 struct plist; /* * Payload message users are defined in TIPC's public API: * - TIPC_LOW_IMPORTANCE * - TIPC_MEDIUM_IMPORTANCE * - TIPC_HIGH_IMPORTANCE * - TIPC_CRITICAL_IMPORTANCE */ #define TIPC_SYSTEM_IMPORTANCE 4 /* * Payload message types */ #define TIPC_CONN_MSG 0 #define TIPC_MCAST_MSG 1 #define TIPC_NAMED_MSG 2 #define TIPC_DIRECT_MSG 3 #define TIPC_GRP_MEMBER_EVT 4 #define TIPC_GRP_BCAST_MSG 5 #define TIPC_GRP_MCAST_MSG 6 #define TIPC_GRP_UCAST_MSG 7 /* * Internal message users */ #define BCAST_PROTOCOL 5 #define MSG_BUNDLER 6 #define LINK_PROTOCOL 7 #define CONN_MANAGER 8 #define GROUP_PROTOCOL 9 #define TUNNEL_PROTOCOL 10 #define NAME_DISTRIBUTOR 11 #define MSG_FRAGMENTER 12 #define LINK_CONFIG 13 #define SOCK_WAKEUP 14 /* pseudo user */ #define TOP_SRV 15 /* pseudo user */ /* * Message header sizes */ #define SHORT_H_SIZE 24 /* In-cluster basic payload message */ #define BASIC_H_SIZE 32 /* Basic payload message */ #define NAMED_H_SIZE 40 /* Named payload message */ #define MCAST_H_SIZE 44 /* Multicast payload message */ #define GROUP_H_SIZE 44 /* Group payload message */ #define INT_H_SIZE 40 /* Internal messages */ #define MIN_H_SIZE 24 /* Smallest legal TIPC header size */ #define MAX_H_SIZE 60 /* Largest possible TIPC header size */ #define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE) #define FB_MTU 3744 #define TIPC_MEDIA_INFO_OFFSET 5 struct tipc_skb_cb { u32 bytes_read; u32 orig_member; struct sk_buff *tail; bool validated; u16 chain_imp; u16 ackers; }; #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) struct tipc_msg { __be32 hdr[15]; }; static inline struct tipc_msg *buf_msg(struct sk_buff *skb) { return (struct tipc_msg *)skb->data; } static inline u32 msg_word(struct tipc_msg *m, u32 pos) { return ntohl(m->hdr[pos]); } static inline void msg_set_word(struct tipc_msg *m, u32 w, u32 val) { m->hdr[w] = htonl(val); } static inline u32 msg_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask) { return (msg_word(m, w) >> pos) & mask; } static inline void msg_set_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask, u32 val) { val = (val & mask) << pos; mask = mask << pos; m->hdr[w] &= ~htonl(mask); m->hdr[w] |= htonl(val); } static inline void msg_swap_words(struct tipc_msg *msg, u32 a, u32 b) { u32 temp = msg->hdr[a]; msg->hdr[a] = msg->hdr[b]; msg->hdr[b] = temp; } /* * Word 0 */ static inline u32 msg_version(struct tipc_msg *m) { return msg_bits(m, 0, 29, 7); } static inline void msg_set_version(struct tipc_msg *m) { msg_set_bits(m, 0, 29, 7, TIPC_VERSION); } static inline u32 msg_user(struct tipc_msg *m) { return msg_bits(m, 0, 25, 0xf); } static inline u32 msg_isdata(struct tipc_msg *m) { return msg_user(m) <= TIPC_CRITICAL_IMPORTANCE; } static inline void msg_set_user(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 25, 0xf, n); } static inline u32 msg_hdr_sz(struct tipc_msg *m) { return msg_bits(m, 0, 21, 0xf) << 2; } static inline void msg_set_hdr_sz(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 21, 0xf, n>>2); } static inline u32 msg_size(struct tipc_msg *m) { return msg_bits(m, 0, 0, 0x1ffff); } static inline u32 msg_blocks(struct tipc_msg *m) { return (msg_size(m) / 1024) + 1; } static inline u32 msg_data_sz(struct tipc_msg *m) { return msg_size(m) - msg_hdr_sz(m); } static inline int 
msg_non_seq(struct tipc_msg *m) { return msg_bits(m, 0, 20, 1); } static inline void msg_set_non_seq(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 20, 1, n); } static inline int msg_dest_droppable(struct tipc_msg *m) { return msg_bits(m, 0, 19, 1); } static inline void msg_set_dest_droppable(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 19, 1, d); } static inline int msg_is_keepalive(struct tipc_msg *m) { return msg_bits(m, 0, 19, 1); } static inline void msg_set_is_keepalive(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 19, 1, d); } static inline int msg_src_droppable(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 18, 1, d); } static inline void msg_set_size(struct tipc_msg *m, u32 sz) { m->hdr[0] = htonl((msg_word(m, 0) & ~0x1ffff) | sz); } static inline unchar *msg_data(struct tipc_msg *m) { return ((unchar *)m) + msg_hdr_sz(m); } static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m) { return (struct tipc_msg *)msg_data(m); } /* * Word 1 */ static inline u32 msg_type(struct tipc_msg *m) { return msg_bits(m, 1, 29, 0x7); } static inline void msg_set_type(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 29, 0x7, n); } static inline int msg_in_group(struct tipc_msg *m) { int mtyp = msg_type(m); return mtyp >= TIPC_GRP_MEMBER_EVT && mtyp <= TIPC_GRP_UCAST_MSG; } static inline bool msg_is_grp_evt(struct tipc_msg *m) { return msg_type(m) == TIPC_GRP_MEMBER_EVT; } static inline u32 msg_named(struct tipc_msg *m) { return msg_type(m) == TIPC_NAMED_MSG; } static inline u32 msg_mcast(struct tipc_msg *m) { int mtyp = msg_type(m); return ((mtyp == TIPC_MCAST_MSG) || (mtyp == TIPC_GRP_BCAST_MSG) || (mtyp == TIPC_GRP_MCAST_MSG)); } static inline u32 msg_connected(struct tipc_msg *m) { return msg_type(m) == TIPC_CONN_MSG; } static inline u32 msg_errcode(struct tipc_msg *m) { return msg_bits(m, 1, 25, 0xf); } static inline void msg_set_errcode(struct tipc_msg *m, u32 err) { msg_set_bits(m, 1, 25, 0xf, err); } static inline u32 msg_reroute_cnt(struct tipc_msg *m) { return msg_bits(m, 1, 21, 0xf); } static inline void msg_incr_reroute_cnt(struct tipc_msg *m) { msg_set_bits(m, 1, 21, 0xf, msg_reroute_cnt(m) + 1); } static inline void msg_reset_reroute_cnt(struct tipc_msg *m) { msg_set_bits(m, 1, 21, 0xf, 0); } static inline u32 msg_lookup_scope(struct tipc_msg *m) { return msg_bits(m, 1, 19, 0x3); } static inline void msg_set_lookup_scope(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 19, 0x3, n); } static inline u16 msg_bcast_ack(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_bcast_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 1, 0, 0xffff, n); } /* * Word 2 */ static inline u16 msg_ack(struct tipc_msg *m) { return msg_bits(m, 2, 16, 0xffff); } static inline void msg_set_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 16, 0xffff, n); } static inline u16 msg_seqno(struct tipc_msg *m) { return msg_bits(m, 2, 0, 0xffff); } static inline void msg_set_seqno(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 0, 0xffff, n); } /* * Words 3-10 */ static inline u32 msg_importance(struct tipc_msg *m) { int usr = msg_user(m); if (likely((usr <= TIPC_CRITICAL_IMPORTANCE) && !msg_errcode(m))) return usr; if ((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER)) return msg_bits(m, 9, 0, 0x7); return TIPC_SYSTEM_IMPORTANCE; } static inline void msg_set_importance(struct tipc_msg *m, u32 i) { int usr = msg_user(m); if (likely((usr == MSG_FRAGMENTER) || (usr == 
MSG_BUNDLER))) msg_set_bits(m, 9, 0, 0x7, i); else if (i < TIPC_SYSTEM_IMPORTANCE) msg_set_user(m, i); else pr_warn("Trying to set illegal importance in message\n"); } static inline u32 msg_prevnode(struct tipc_msg *m) { return msg_word(m, 3); } static inline void msg_set_prevnode(struct tipc_msg *m, u32 a) { msg_set_word(m, 3, a); } static inline u32 msg_origport(struct tipc_msg *m) { if (msg_user(m) == MSG_FRAGMENTER) m = msg_get_wrapped(m); return msg_word(m, 4); } static inline void msg_set_origport(struct tipc_msg *m, u32 p) { msg_set_word(m, 4, p); } static inline u32 msg_destport(struct tipc_msg *m) { return msg_word(m, 5); } static inline void msg_set_destport(struct tipc_msg *m, u32 p) { msg_set_word(m, 5, p); } static inline u32 msg_mc_netid(struct tipc_msg *m) { return msg_word(m, 5); } static inline void msg_set_mc_netid(struct tipc_msg *m, u32 p) { msg_set_word(m, 5, p); } static inline int msg_short(struct tipc_msg *m) { return msg_hdr_sz(m) == SHORT_H_SIZE; } static inline u32 msg_orignode(struct tipc_msg *m) { if (likely(msg_short(m))) return msg_prevnode(m); return msg_word(m, 6); } static inline void msg_set_orignode(struct tipc_msg *m, u32 a) { msg_set_word(m, 6, a); } static inline u32 msg_destnode(struct tipc_msg *m) { return msg_word(m, 7); } static inline void msg_set_destnode(struct tipc_msg *m, u32 a) { msg_set_word(m, 7, a); } static inline u32 msg_nametype(struct tipc_msg *m) { return msg_word(m, 8); } static inline void msg_set_nametype(struct tipc_msg *m, u32 n) { msg_set_word(m, 8, n); } static inline u32 msg_nameinst(struct tipc_msg *m) { return msg_word(m, 9); } static inline u32 msg_namelower(struct tipc_msg *m) { return msg_nameinst(m); } static inline void msg_set_namelower(struct tipc_msg *m, u32 n) { msg_set_word(m, 9, n); } static inline void msg_set_nameinst(struct tipc_msg *m, u32 n) { msg_set_namelower(m, n); } static inline u32 msg_nameupper(struct tipc_msg *m) { return msg_word(m, 10); } static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) { msg_set_word(m, 10, n); } /* * Constants and routines used to read and write TIPC internal message headers */ /* * Connection management protocol message types */ #define CONN_PROBE 0 #define CONN_PROBE_REPLY 1 #define CONN_ACK 2 /* * Name distributor message types */ #define PUBLICATION 0 #define WITHDRAWAL 1 /* * Segmentation message types */ #define FIRST_FRAGMENT 0 #define FRAGMENT 1 #define LAST_FRAGMENT 2 /* * Link management protocol message types */ #define STATE_MSG 0 #define RESET_MSG 1 #define ACTIVATE_MSG 2 /* * Changeover tunnel message types */ #define SYNCH_MSG 0 #define FAILOVER_MSG 1 /* * Config protocol message types */ #define DSC_REQ_MSG 0 #define DSC_RESP_MSG 1 #define DSC_TRIAL_MSG 2 #define DSC_TRIAL_FAIL_MSG 3 /* * Group protocol message types */ #define GRP_JOIN_MSG 0 #define GRP_LEAVE_MSG 1 #define GRP_ADV_MSG 2 #define GRP_ACK_MSG 3 #define GRP_RECLAIM_MSG 4 #define GRP_REMIT_MSG 5 /* * Word 1 */ static inline u32 msg_seq_gap(struct tipc_msg *m) { return msg_bits(m, 1, 16, 0x1fff); } static inline void msg_set_seq_gap(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 16, 0x1fff, n); } static inline u32 msg_node_sig(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_node_sig(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 0, 0xffff, n); } static inline u32 msg_node_capabilities(struct tipc_msg *m) { return msg_bits(m, 1, 15, 0x1fff); } static inline void msg_set_node_capabilities(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 15, 0x1fff, n); 
} /* * Word 2 */ static inline u32 msg_dest_domain(struct tipc_msg *m) { return msg_word(m, 2); } static inline void msg_set_dest_domain(struct tipc_msg *m, u32 n) { msg_set_word(m, 2, n); } static inline u32 msg_bcgap_after(struct tipc_msg *m) { return msg_bits(m, 2, 16, 0xffff); } static inline void msg_set_bcgap_after(struct tipc_msg *m, u32 n) { msg_set_bits(m, 2, 16, 0xffff, n); } static inline u32 msg_bcgap_to(struct tipc_msg *m) { return msg_bits(m, 2, 0, 0xffff); } static inline void msg_set_bcgap_to(struct tipc_msg *m, u32 n) { msg_set_bits(m, 2, 0, 0xffff, n); } /* * Word 4 */ static inline u32 msg_last_bcast(struct tipc_msg *m) { return msg_bits(m, 4, 16, 0xffff); } static inline u32 msg_bc_snd_nxt(struct tipc_msg *m) { return msg_last_bcast(m) + 1; } static inline void msg_set_last_bcast(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 16, 0xffff, n); } static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 16, 0xffff, n); } static inline u16 msg_next_sent(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_next_sent(struct tipc_msg *m, u16 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline void msg_set_long_msgno(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_bc_netid(struct tipc_msg *m) { return msg_word(m, 4); } static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id) { msg_set_word(m, 4, id); } static inline u32 msg_link_selector(struct tipc_msg *m) { if (msg_user(m) == MSG_FRAGMENTER) m = (void *)msg_data(m); return msg_bits(m, 4, 0, 1); } /* * Word 5 */ static inline u16 msg_session(struct tipc_msg *m) { return msg_bits(m, 5, 16, 0xffff); } static inline void msg_set_session(struct tipc_msg *m, u16 n) { msg_set_bits(m, 5, 16, 0xffff, n); } static inline u32 msg_probe(struct tipc_msg *m) { return msg_bits(m, 5, 0, 1); } static inline void msg_set_probe(struct tipc_msg *m, u32 val) { msg_set_bits(m, 5, 0, 1, val); } static inline char msg_net_plane(struct tipc_msg *m) { return msg_bits(m, 5, 1, 7) + 'A'; } static inline void msg_set_net_plane(struct tipc_msg *m, char n) { msg_set_bits(m, 5, 1, 7, (n - 'A')); } static inline u32 msg_linkprio(struct tipc_msg *m) { return msg_bits(m, 5, 4, 0x1f); } static inline void msg_set_linkprio(struct tipc_msg *m, u32 n) { msg_set_bits(m, 5, 4, 0x1f, n); } static inline u32 msg_bearer_id(struct tipc_msg *m) { return msg_bits(m, 5, 9, 0x7); } static inline void msg_set_bearer_id(struct tipc_msg *m, u32 n) { msg_set_bits(m, 5, 9, 0x7, n); } static inline u32 msg_redundant_link(struct tipc_msg *m) { return msg_bits(m, 5, 12, 0x1); } static inline void msg_set_redundant_link(struct tipc_msg *m, u32 r) { msg_set_bits(m, 5, 12, 0x1, r); } static inline u32 msg_peer_stopping(struct tipc_msg *m) { return msg_bits(m, 5, 13, 0x1); } static inline void msg_set_peer_stopping(struct tipc_msg *m, u32 s) { msg_set_bits(m, 5, 13, 0x1, s); } static inline bool msg_bc_ack_invalid(struct tipc_msg *m) { switch (msg_user(m)) { case BCAST_PROTOCOL: case NAME_DISTRIBUTOR: case LINK_PROTOCOL: return msg_bits(m, 5, 14, 0x1); default: return false; } } static inline void msg_set_bc_ack_invalid(struct tipc_msg *m, bool invalid) { msg_set_bits(m, 5, 14, 0x1, invalid); } static inline char *msg_media_addr(struct tipc_msg *m) { return (char *)&m->hdr[TIPC_MEDIA_INFO_OFFSET]; } static inline u32 msg_bc_gap(struct tipc_msg *m) { return msg_bits(m, 8, 0, 0x3ff); } static inline void msg_set_bc_gap(struct tipc_msg *m, u32 n) { msg_set_bits(m, 8, 0, 0x3ff, n); } 
/* * Word 9 */ static inline u16 msg_msgcnt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u32 msg_conn_ack(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_conn_ack(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_adv_win(struct tipc_msg *m) { return msg_bits(m, 9, 0, 0xffff); } static inline void msg_set_adv_win(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 0, 0xffff, n); } static inline u32 msg_max_pkt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff) * 4; } static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 16, 0xffff, (n / 4)); } static inline u32 msg_link_tolerance(struct tipc_msg *m) { return msg_bits(m, 9, 0, 0xffff); } static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 0, 0xffff, n); } static inline u16 msg_grp_bc_syncpt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_bc_syncpt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_grp_bc_acked(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_bc_acked(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_grp_remitted(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_remitted(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } /* Word 10 */ static inline u16 msg_grp_evt(struct tipc_msg *m) { return msg_bits(m, 10, 0, 0x3); } static inline void msg_set_grp_evt(struct tipc_msg *m, int n) { msg_set_bits(m, 10, 0, 0x3, n); } static inline u16 msg_grp_bc_ack_req(struct tipc_msg *m) { return msg_bits(m, 10, 0, 0x1); } static inline void msg_set_grp_bc_ack_req(struct tipc_msg *m, bool n) { msg_set_bits(m, 10, 0, 0x1, n); } static inline u16 msg_grp_bc_seqno(struct tipc_msg *m) { return msg_bits(m, 10, 16, 0xffff); } static inline void msg_set_grp_bc_seqno(struct tipc_msg *m, u32 n) { msg_set_bits(m, 10, 16, 0xffff, n); } static inline bool msg_peer_link_is_up(struct tipc_msg *m) { if (likely(msg_user(m) != LINK_PROTOCOL)) return true; if (msg_type(m) == STATE_MSG) return true; return false; } static inline bool msg_peer_node_is_up(struct tipc_msg *m) { if (msg_peer_link_is_up(m)) return true; return msg_redundant_link(m); } static inline bool msg_is_reset(struct tipc_msg *hdr) { return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); } static inline u32 msg_sugg_node_addr(struct tipc_msg *m) { return msg_word(m, 14); } static inline void msg_set_sugg_node_addr(struct tipc_msg *m, u32 n) { msg_set_word(m, 14, n); } static inline void msg_set_node_id(struct tipc_msg *hdr, u8 *id) { memcpy(msg_data(hdr), id, 16); } static inline u8 *msg_node_id(struct tipc_msg *hdr) { return (u8 *)msg_data(hdr); } struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp); bool tipc_msg_validate(struct sk_buff **_skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, struct sk_buff_head *xmitq); void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode); struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, uint data_sz, u32 dnode, u32 onode, u32 dport, u32 oport, int errcode); int tipc_buf_append(struct sk_buff **headbuf, struct 
sk_buff **buf); bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu); bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, u32 mtu, u32 dnode); bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); bool tipc_msg_assemble(struct sk_buff_head *list); bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, struct sk_buff_head *cpy); void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, struct sk_buff *skb); static inline u16 buf_seqno(struct sk_buff *skb) { return msg_seqno(buf_msg(skb)); } static inline int buf_roundup_len(struct sk_buff *skb) { return (skb->len / 1024 + 1) * 1024; } /* tipc_skb_peek(): peek and reserve first buffer in list * @list: list to be peeked in * Returns pointer to first buffer in list, if any */ static inline struct sk_buff *tipc_skb_peek(struct sk_buff_head *list, spinlock_t *lock) { struct sk_buff *skb; spin_lock_bh(lock); skb = skb_peek(list); if (skb) skb_get(skb); spin_unlock_bh(lock); return skb; } /* tipc_skb_peek_port(): find a destination port, ignoring all destinations * up to and including 'filter'. * Note: ignoring previously tried destinations minimizes the risk of * contention on the socket lock * @list: list to be peeked in * @filter: last destination to be ignored from search * Returns a destination port number, of applicable. */ static inline u32 tipc_skb_peek_port(struct sk_buff_head *list, u32 filter) { struct sk_buff *skb; u32 dport = 0; bool ignore = true; spin_lock_bh(&list->lock); skb_queue_walk(list, skb) { dport = msg_destport(buf_msg(skb)); if (!filter || skb_queue_is_last(list, skb)) break; if (dport == filter) ignore = false; else if (!ignore) break; } spin_unlock_bh(&list->lock); return dport; } /* tipc_skb_dequeue(): unlink first buffer with dest 'dport' from list * @list: list to be unlinked from * @dport: selection criteria for buffer to unlink */ static inline struct sk_buff *tipc_skb_dequeue(struct sk_buff_head *list, u32 dport) { struct sk_buff *_skb, *tmp, *skb = NULL; spin_lock_bh(&list->lock); skb_queue_walk_safe(list, _skb, tmp) { if (msg_destport(buf_msg(_skb)) == dport) { __skb_unlink(_skb, list); skb = _skb; break; } } spin_unlock_bh(&list->lock); return skb; } /* tipc_skb_queue_splice_tail - append an skb list to lock protected list * @list: the new list to append. Not lock protected * @head: target list. Lock protected. */ static inline void tipc_skb_queue_splice_tail(struct sk_buff_head *list, struct sk_buff_head *head) { spin_lock_bh(&head->lock); skb_queue_splice_tail(list, head); spin_unlock_bh(&head->lock); } /* tipc_skb_queue_splice_tail_init - merge two lock protected skb lists * @list: the new list to add. Lock protected. Will be reinitialized * @head: target list. Lock protected. */ static inline void tipc_skb_queue_splice_tail_init(struct sk_buff_head *list, struct sk_buff_head *head) { struct sk_buff_head tmp; __skb_queue_head_init(&tmp); spin_lock_bh(&list->lock); skb_queue_splice_tail_init(list, &tmp); spin_unlock_bh(&list->lock); tipc_skb_queue_splice_tail(&tmp, head); } #endif |
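/*
 * The payload and internal header accessors above are almost all built on the
 * same pattern: the header is an array of big-endian 32-bit words, and each
 * field is a (word, shift, mask) triple read through msg_bits() and written
 * through msg_set_bits().  What follows is a minimal, self-contained userspace
 * sketch of that encoding -- the helper names (hdr_bits/hdr_set_bits) and
 * main() are made up for illustration and are not part of the TIPC API;
 * byte-order conversion uses htonl()/ntohl() from <arpa/inet.h>.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t hdr_bits(const uint32_t *hdr, int w, int pos, uint32_t mask)
{
	/* Convert the word to host order, shift the field down, mask it out. */
	return (ntohl(hdr[w]) >> pos) & mask;
}

static void hdr_set_bits(uint32_t *hdr, int w, int pos, uint32_t mask,
			 uint32_t val)
{
	uint32_t word = ntohl(hdr[w]);

	/* Clear the old field, then OR in the new value, masked and shifted. */
	word &= ~(mask << pos);
	word |= (val & mask) << pos;
	hdr[w] = htonl(word);
}

int main(void)
{
	uint32_t hdr[15] = { 0 };	/* same shape as struct tipc_msg */

	/* Word 0: version in bits 29..31, header size / 4 in bits 21..24. */
	hdr_set_bits(hdr, 0, 29, 0x7, 2);	/* TIPC_VERSION */
	hdr_set_bits(hdr, 0, 21, 0xf, 24 >> 2);	/* SHORT_H_SIZE in words */

	printf("version=%u hdr_sz=%u bytes\n",
	       (unsigned int)hdr_bits(hdr, 0, 29, 0x7),
	       (unsigned int)(hdr_bits(hdr, 0, 21, 0xf) << 2));
	return 0;
}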
7 2 2 7 7 7 6 1 1 1 7 2 2 2 2 2 2 2 2 2 2 34 34 34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 | /* * Keyed 32-bit hash function using TEA in a Davis-Meyer function * H0 = Key * Hi = E Mi(Hi-1) + Hi-1 * * (see Applied Cryptography, 2nd edition, p448). * * Jeremy Fitzhardinge <jeremy@zip.com.au> 1998 * * Jeremy has agreed to the contents of reiserfs/README. -Hans * Yura's function is added (04/07/2000) */ #include <linux/kernel.h> #include "reiserfs.h" #include <asm/types.h> #define DELTA 0x9E3779B9 #define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ #define PARTROUNDS 6 /* 6 gets complete mixing */ /* a, b, c, d - data; h0, h1 - accumulated hash */ #define TEACORE(rounds) \ do { \ u32 sum = 0; \ int n = rounds; \ u32 b0, b1; \ \ b0 = h0; \ b1 = h1; \ \ do \ { \ sum += DELTA; \ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \ } while(--n); \ \ h0 += b0; \ h1 += b1; \ } while(0) u32 keyed_hash(const signed char *msg, int len) { u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3 }; u32 h0 = k[0], h1 = k[1]; u32 a, b, c, d; u32 pad; int i; /* assert(len >= 0 && len < 256); */ pad = (u32) len | ((u32) len << 8); pad |= pad << 16; while (len >= 16) { a = (u32) msg[0] | (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; b = (u32) msg[4] | (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; c = (u32) msg[8] | (u32) msg[9] << 8 | (u32) msg[10] << 16 | (u32) msg[11] << 24; d = (u32) msg[12] | (u32) msg[13] << 8 | (u32) msg[14] << 16 | (u32) msg[15] << 24; TEACORE(PARTROUNDS); len -= 16; msg += 16; } if (len >= 12) { a = (u32) msg[0] | (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; b = (u32) msg[4] | (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; c = (u32) msg[8] | (u32) msg[9] << 8 | (u32) msg[10] << 16 | (u32) msg[11] << 24; d = pad; for (i = 12; i < len; i++) { d <<= 8; d |= msg[i]; } } else if (len >= 8) { a = (u32) msg[0] | (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; b = (u32) msg[4] | (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; c = d = pad; for (i = 8; i < len; i++) { c <<= 8; c |= msg[i]; } } else if (len >= 4) { a = (u32) msg[0] | (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; b = c = d = pad; for (i = 4; i < len; i++) { b <<= 8; b |= msg[i]; } } else { a = b = c = d = pad; for (i = 0; i < len; i++) { a <<= 8; a |= msg[i]; } } TEACORE(FULLROUNDS); /* return 0;*/ return h0 ^ h1; } /* * What follows in this file is copyright 2000 by Hans Reiser, and the * licensing of what follows is governed by reiserfs/README */ u32 yura_hash(const signed char *msg, int len) { int j, pow; u32 a, c; int i; for (pow = 1, i = 1; i < len; i++) pow = pow * 10; if (len == 1) a = msg[0] - 48; else a = (msg[0] - 48) * pow; for (i = 1; i < len; i++) { c = msg[i] - 48; for (pow = 1, j = i; j < len - 1; j++) pow = pow * 10; a = a + c * pow; } for (; i < 40; i++) { c = '0' - 48; for (pow = 1, j = i; j < len - 
1; j++) pow = pow * 10; a = a + c * pow; } for (; i < 256; i++) { c = i; for (pow = 1, j = i; j < len - 1; j++) pow = pow * 10; a = a + c * pow; } a = a << 7; return a; } u32 r5_hash(const signed char *msg, int len) { u32 a = 0; while (*msg) { a += *msg << 4; a += *msg >> 4; a *= 11; msg++; } return a; }
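/*
 * Standalone illustration of the r5 name hash above: the accumulator folds
 * each byte in twice (the byte scaled by 16 plus its top four bits) and is
 * multiplied by 11 per character.  Note that, as written, r5_hash() walks the
 * name up to its NUL terminator and never uses the len argument.  This is a
 * userspace sketch for experimentation only -- the duplicated r5_hash_demo()
 * and main() below are not part of reiserfs.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t r5_hash_demo(const signed char *msg)
{
	uint32_t a = 0;

	while (*msg) {
		a += *msg << 4;	/* byte value scaled by 16 */
		a += *msg >> 4;	/* plus its top four bits */
		a *= 11;
		msg++;
	}
	return a;
}

int main(void)
{
	const char *names[] = { "foo.c", "foo.h", "Makefile" };
	unsigned int i;

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++)
		printf("%-10s -> 0x%08x\n", names[i],
		       (unsigned int)r5_hash_demo((const signed char *)names[i]));
	return 0;
}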
888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 | // SPDX-License-Identifier: GPL-2.0+ /* * cpfile.c - NILFS checkpoint file. * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * * Written by Koji Sato. */ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/errno.h> #include "mdt.h" #include "cpfile.h" static inline unsigned long nilfs_cpfile_checkpoints_per_block(const struct inode *cpfile) { return NILFS_MDT(cpfile)->mi_entries_per_block; } /* block number from the beginning of the file */ static unsigned long nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno) { __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); return (unsigned long)tcno; } /* offset in block */ static unsigned long nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno) { __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); } static __u64 nilfs_cpfile_first_checkpoint_in_block(const struct inode *cpfile, unsigned long blkoff) { return (__u64)nilfs_cpfile_checkpoints_per_block(cpfile) * blkoff + 1 - NILFS_MDT(cpfile)->mi_first_entry_offset; } static unsigned long nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile, __u64 curr, __u64 max) { return min_t(__u64, nilfs_cpfile_checkpoints_per_block(cpfile) - nilfs_cpfile_get_offset(cpfile, curr), max - curr); } static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile, __u64 cno) { return nilfs_cpfile_get_blkoff(cpfile, cno) == 0; } static unsigned int nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile, struct buffer_head *bh, void *kaddr, unsigned int n) { struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); unsigned int count; count = le32_to_cpu(cp->cp_checkpoints_count) + n; cp->cp_checkpoints_count = cpu_to_le32(count); return count; } static unsigned int nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile, struct buffer_head *bh, void *kaddr, unsigned int n) { struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); unsigned int count; WARN_ON(le32_to_cpu(cp->cp_checkpoints_count) < n); count = le32_to_cpu(cp->cp_checkpoints_count) - n; cp->cp_checkpoints_count = cpu_to_le32(count); return count; } static inline struct nilfs_cpfile_header * nilfs_cpfile_block_get_header(const struct inode *cpfile, struct buffer_head *bh, void *kaddr) { return kaddr + bh_offset(bh); } static struct nilfs_checkpoint * nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno, struct buffer_head *bh, void *kaddr) { return kaddr + bh_offset(bh) + nilfs_cpfile_get_offset(cpfile, cno) * NILFS_MDT(cpfile)->mi_entry_size; } static void nilfs_cpfile_block_init(struct inode *cpfile, struct buffer_head *bh, void *kaddr) { struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; int n = nilfs_cpfile_checkpoints_per_block(cpfile); while (n-- > 0) { 
nilfs_checkpoint_set_invalid(cp); cp = (void *)cp + cpsz; } } static inline int nilfs_cpfile_get_header_block(struct inode *cpfile, struct buffer_head **bhp) { return nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp); } static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile, __u64 cno, int create, struct buffer_head **bhp) { return nilfs_mdt_get_block(cpfile, nilfs_cpfile_get_blkoff(cpfile, cno), create, nilfs_cpfile_block_init, bhp); } /** * nilfs_cpfile_find_checkpoint_block - find and get a buffer on cpfile * @cpfile: inode of cpfile * @start_cno: start checkpoint number (inclusive) * @end_cno: end checkpoint number (inclusive) * @cnop: place to store the next checkpoint number * @bhp: place to store a pointer to buffer_head struct * * Return Value: On success, it returns 0. On error, the following negative * error code is returned. * * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error * * %-ENOENT - no block exists in the range. */ static int nilfs_cpfile_find_checkpoint_block(struct inode *cpfile, __u64 start_cno, __u64 end_cno, __u64 *cnop, struct buffer_head **bhp) { unsigned long start, end, blkoff; int ret; if (unlikely(start_cno > end_cno)) return -ENOENT; start = nilfs_cpfile_get_blkoff(cpfile, start_cno); end = nilfs_cpfile_get_blkoff(cpfile, end_cno); ret = nilfs_mdt_find_block(cpfile, start, end, &blkoff, bhp); if (!ret) *cnop = (blkoff == start) ? start_cno : nilfs_cpfile_first_checkpoint_in_block(cpfile, blkoff); return ret; } static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile, __u64 cno) { return nilfs_mdt_delete_block(cpfile, nilfs_cpfile_get_blkoff(cpfile, cno)); } /** * nilfs_cpfile_get_checkpoint - get a checkpoint * @cpfile: inode of checkpoint file * @cno: checkpoint number * @create: create flag * @cpp: pointer to a checkpoint * @bhp: pointer to a buffer head * * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint * specified by @cno. A new checkpoint will be created if @cno is the current * checkpoint number and @create is nonzero. * * Return Value: On success, 0 is returned, and the checkpoint and the * buffer head of the buffer on which the checkpoint is located are stored in * the place pointed by @cpp and @bhp, respectively. On error, one of the * following negative error codes is returned. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient amount of memory available. * * %-ENOENT - No such checkpoint. * * %-EINVAL - invalid checkpoint. 
*/ int nilfs_cpfile_get_checkpoint(struct inode *cpfile, __u64 cno, int create, struct nilfs_checkpoint **cpp, struct buffer_head **bhp) { struct buffer_head *header_bh, *cp_bh; struct nilfs_cpfile_header *header; struct nilfs_checkpoint *cp; void *kaddr; int ret; if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) || (cno < nilfs_mdt_cno(cpfile) && create))) return -EINVAL; down_write(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); if (ret < 0) goto out_sem; ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh); if (ret < 0) goto out_header; kaddr = kmap(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); if (nilfs_checkpoint_invalid(cp)) { if (!create) { kunmap(cp_bh->b_page); brelse(cp_bh); ret = -ENOENT; goto out_header; } /* a newly-created checkpoint */ nilfs_checkpoint_clear_invalid(cp); if (!nilfs_cpfile_is_in_first(cpfile, cno)) nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh, kaddr, 1); mark_buffer_dirty(cp_bh); kaddr = kmap_atomic(header_bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); le64_add_cpu(&header->ch_ncheckpoints, 1); kunmap_atomic(kaddr); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); } if (cpp != NULL) *cpp = cp; *bhp = cp_bh; out_header: brelse(header_bh); out_sem: up_write(&NILFS_MDT(cpfile)->mi_sem); return ret; } /** * nilfs_cpfile_put_checkpoint - put a checkpoint * @cpfile: inode of checkpoint file * @cno: checkpoint number * @bh: buffer head * * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint * specified by @cno. @bh must be the buffer head which has been returned by * a previous call to nilfs_cpfile_get_checkpoint() with @cno. */ void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno, struct buffer_head *bh) { kunmap(bh->b_page); brelse(bh); } /** * nilfs_cpfile_delete_checkpoints - delete checkpoints * @cpfile: inode of checkpoint file * @start: start checkpoint number * @end: end checkpoint numer * * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in * the period from @start to @end, excluding @end itself. The checkpoints * which have been already deleted are ignored. * * Return Value: On success, 0 is returned. On error, one of the following * negative error codes is returned. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient amount of memory available. * * %-EINVAL - invalid checkpoints. 
*/ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, __u64 start, __u64 end) { struct buffer_head *header_bh, *cp_bh; struct nilfs_cpfile_header *header; struct nilfs_checkpoint *cp; size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; __u64 cno; void *kaddr; unsigned long tnicps; int ret, ncps, nicps, nss, count, i; if (unlikely(start == 0 || start > end)) { nilfs_msg(cpfile->i_sb, KERN_ERR, "cannot delete checkpoints: invalid range [%llu, %llu)", (unsigned long long)start, (unsigned long long)end); return -EINVAL; } down_write(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); if (ret < 0) goto out_sem; tnicps = 0; nss = 0; for (cno = start; cno < end; cno += ncps) { ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, end); ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) { if (ret != -ENOENT) break; /* skip hole */ ret = 0; continue; } kaddr = kmap_atomic(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint( cpfile, cno, cp_bh, kaddr); nicps = 0; for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) { if (nilfs_checkpoint_snapshot(cp)) { nss++; } else if (!nilfs_checkpoint_invalid(cp)) { nilfs_checkpoint_set_invalid(cp); nicps++; } } if (nicps > 0) { tnicps += nicps; mark_buffer_dirty(cp_bh); nilfs_mdt_mark_dirty(cpfile); if (!nilfs_cpfile_is_in_first(cpfile, cno)) { count = nilfs_cpfile_block_sub_valid_checkpoints( cpfile, cp_bh, kaddr, nicps); if (count == 0) { /* make hole */ kunmap_atomic(kaddr); brelse(cp_bh); ret = nilfs_cpfile_delete_checkpoint_block( cpfile, cno); if (ret == 0) continue; nilfs_msg(cpfile->i_sb, KERN_ERR, "error %d deleting checkpoint block", ret); break; } } } kunmap_atomic(kaddr); brelse(cp_bh); } if (tnicps > 0) { kaddr = kmap_atomic(header_bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); kunmap_atomic(kaddr); } brelse(header_bh); if (nss > 0) ret = -EBUSY; out_sem: up_write(&NILFS_MDT(cpfile)->mi_sem); return ret; } static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile, struct nilfs_checkpoint *cp, struct nilfs_cpinfo *ci) { ci->ci_flags = le32_to_cpu(cp->cp_flags); ci->ci_cno = le64_to_cpu(cp->cp_cno); ci->ci_create = le64_to_cpu(cp->cp_create); ci->ci_nblk_inc = le64_to_cpu(cp->cp_nblk_inc); ci->ci_inodes_count = le64_to_cpu(cp->cp_inodes_count); ci->ci_blocks_count = le64_to_cpu(cp->cp_blocks_count); ci->ci_next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); } static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, void *buf, unsigned int cisz, size_t nci) { struct nilfs_checkpoint *cp; struct nilfs_cpinfo *ci = buf; struct buffer_head *bh; size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop; void *kaddr; int n, ret; int ncps, i; if (cno == 0) return -ENOENT; /* checkpoint number 0 is invalid */ down_read(&NILFS_MDT(cpfile)->mi_sem); for (n = 0; n < nci; cno += ncps) { ret = nilfs_cpfile_find_checkpoint_block( cpfile, cno, cur_cno - 1, &cno, &bh); if (ret < 0) { if (likely(ret == -ENOENT)) break; goto out; } ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno); kaddr = kmap_atomic(bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) { if (!nilfs_checkpoint_invalid(cp)) { nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, ci); ci = (void *)ci + cisz; n++; } } kunmap_atomic(kaddr); 
brelse(bh); } ret = n; if (n > 0) { ci = (void *)ci - cisz; *cnop = ci->ci_cno + 1; } out: up_read(&NILFS_MDT(cpfile)->mi_sem); return ret; } static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, void *buf, unsigned int cisz, size_t nci) { struct buffer_head *bh; struct nilfs_cpfile_header *header; struct nilfs_checkpoint *cp; struct nilfs_cpinfo *ci = buf; __u64 curr = *cnop, next; unsigned long curr_blkoff, next_blkoff; void *kaddr; int n = 0, ret; down_read(&NILFS_MDT(cpfile)->mi_sem); if (curr == 0) { ret = nilfs_cpfile_get_header_block(cpfile, &bh); if (ret < 0) goto out; kaddr = kmap_atomic(bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); curr = le64_to_cpu(header->ch_snapshot_list.ssl_next); kunmap_atomic(kaddr); brelse(bh); if (curr == 0) { ret = 0; goto out; } } else if (unlikely(curr == ~(__u64)0)) { ret = 0; goto out; } curr_blkoff = nilfs_cpfile_get_blkoff(cpfile, curr); ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &bh); if (unlikely(ret < 0)) { if (ret == -ENOENT) ret = 0; /* No snapshots (started from a hole block) */ goto out; } kaddr = kmap_atomic(bh->b_page); while (n < nci) { cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr); curr = ~(__u64)0; /* Terminator */ if (unlikely(nilfs_checkpoint_invalid(cp) || !nilfs_checkpoint_snapshot(cp))) break; nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, ci); ci = (void *)ci + cisz; n++; next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); if (next == 0) break; /* reach end of the snapshot list */ next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next); if (curr_blkoff != next_blkoff) { kunmap_atomic(kaddr); brelse(bh); ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0, &bh); if (unlikely(ret < 0)) { WARN_ON(ret == -ENOENT); goto out; } kaddr = kmap_atomic(bh->b_page); } curr = next; curr_blkoff = next_blkoff; } kunmap_atomic(kaddr); brelse(bh); *cnop = curr; ret = n; out: up_read(&NILFS_MDT(cpfile)->mi_sem); return ret; } /** * nilfs_cpfile_get_cpinfo - * @cpfile: * @cno: * @ci: * @nci: */ ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode, void *buf, unsigned int cisz, size_t nci) { switch (mode) { case NILFS_CHECKPOINT: return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, buf, cisz, nci); case NILFS_SNAPSHOT: return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, buf, cisz, nci); default: return -EINVAL; } } /** * nilfs_cpfile_delete_checkpoint - * @cpfile: * @cno: */ int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno) { struct nilfs_cpinfo ci; __u64 tcno = cno; ssize_t nci; nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, sizeof(ci), 1); if (nci < 0) return nci; else if (nci == 0 || ci.ci_cno != cno) return -ENOENT; else if (nilfs_cpinfo_snapshot(&ci)) return -EBUSY; return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1); } static struct nilfs_snapshot_list * nilfs_cpfile_block_get_snapshot_list(const struct inode *cpfile, __u64 cno, struct buffer_head *bh, void *kaddr) { struct nilfs_cpfile_header *header; struct nilfs_checkpoint *cp; struct nilfs_snapshot_list *list; if (cno != 0) { cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); list = &cp->cp_snapshot_list; } else { header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); list = &header->ch_snapshot_list; } return list; } static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) { struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh; struct nilfs_cpfile_header *header; struct nilfs_checkpoint *cp; struct nilfs_snapshot_list *list; __u64 
curr, prev; unsigned long curr_blkoff, prev_blkoff; void *kaddr; int ret; if (cno == 0) return -ENOENT; /* checkpoint number 0 is invalid */ down_write(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) goto out_sem; kaddr = kmap_atomic(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); if (nilfs_checkpoint_invalid(cp)) { ret = -ENOENT; kunmap_atomic(kaddr); goto out_cp; } if (nilfs_checkpoint_snapshot(cp)) { ret = 0; kunmap_atomic(kaddr); goto out_cp; } kunmap_atomic(kaddr); ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); if (ret < 0) goto out_cp; kaddr = kmap_atomic(header_bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); list = &header->ch_snapshot_list; curr_bh = header_bh; get_bh(curr_bh); curr = 0; curr_blkoff = 0; prev = le64_to_cpu(list->ssl_prev); while (prev > cno) { prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev); curr = prev; if (curr_blkoff != prev_blkoff) { kunmap_atomic(kaddr); brelse(curr_bh); ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &curr_bh); if (ret < 0) goto out_header; kaddr = kmap_atomic(curr_bh->b_page); } curr_blkoff = prev_blkoff; cp = nilfs_cpfile_block_get_checkpoint( cpfile, curr, curr_bh, kaddr); list = &cp->cp_snapshot_list; prev = le64_to_cpu(list->ssl_prev); } kunmap_atomic(kaddr); if (prev != 0) { ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, &prev_bh); if (ret < 0) goto out_curr; } else { prev_bh = header_bh; get_bh(prev_bh); } kaddr = kmap_atomic(curr_bh->b_page); list = nilfs_cpfile_block_get_snapshot_list( cpfile, curr, curr_bh, kaddr); list->ssl_prev = cpu_to_le64(cno); kunmap_atomic(kaddr); kaddr = kmap_atomic(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr); cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev); nilfs_checkpoint_set_snapshot(cp); kunmap_atomic(kaddr); kaddr = kmap_atomic(prev_bh->b_page); list = nilfs_cpfile_block_get_snapshot_list( cpfile, prev, prev_bh, kaddr); list->ssl_next = cpu_to_le64(cno); kunmap_atomic(kaddr); kaddr = kmap_atomic(header_bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); le64_add_cpu(&header->ch_nsnapshots, 1); kunmap_atomic(kaddr); mark_buffer_dirty(prev_bh); mark_buffer_dirty(curr_bh); mark_buffer_dirty(cp_bh); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); brelse(prev_bh); out_curr: brelse(curr_bh); out_header: brelse(header_bh); out_cp: brelse(cp_bh); out_sem: up_write(&NILFS_MDT(cpfile)->mi_sem); return ret; } static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) { struct buffer_head *header_bh, *next_bh, *prev_bh, *cp_bh; struct nilfs_cpfile_header *header; struct nilfs_checkpoint *cp; struct nilfs_snapshot_list *list; __u64 next, prev; void *kaddr; int ret; if (cno == 0) return -ENOENT; /* checkpoint number 0 is invalid */ down_write(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) goto out_sem; kaddr = kmap_atomic(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); if (nilfs_checkpoint_invalid(cp)) { ret = -ENOENT; kunmap_atomic(kaddr); goto out_cp; } if (!nilfs_checkpoint_snapshot(cp)) { ret = 0; kunmap_atomic(kaddr); goto out_cp; } list = &cp->cp_snapshot_list; next = le64_to_cpu(list->ssl_next); prev = le64_to_cpu(list->ssl_prev); kunmap_atomic(kaddr); ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); if (ret < 0) 
goto out_cp; if (next != 0) { ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0, &next_bh); if (ret < 0) goto out_header; } else { next_bh = header_bh; get_bh(next_bh); } if (prev != 0) { ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, &prev_bh); if (ret < 0) goto out_next; } else { prev_bh = header_bh; get_bh(prev_bh); } kaddr = kmap_atomic(next_bh->b_page); list = nilfs_cpfile_block_get_snapshot_list( cpfile, next, next_bh, kaddr); list->ssl_prev = cpu_to_le64(prev); kunmap_atomic(kaddr); kaddr = kmap_atomic(prev_bh->b_page); list = nilfs_cpfile_block_get_snapshot_list( cpfile, prev, prev_bh, kaddr); list->ssl_next = cpu_to_le64(next); kunmap_atomic(kaddr); kaddr = kmap_atomic(cp_bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); cp->cp_snapshot_list.ssl_next = cpu_to_le64(0); cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0); nilfs_checkpoint_clear_snapshot(cp); kunmap_atomic(kaddr); kaddr = kmap_atomic(header_bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); le64_add_cpu(&header->ch_nsnapshots, -1); kunmap_atomic(kaddr); mark_buffer_dirty(next_bh); mark_buffer_dirty(prev_bh); mark_buffer_dirty(cp_bh); mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); brelse(prev_bh); out_next: brelse(next_bh); out_header: brelse(header_bh); out_cp: brelse(cp_bh); out_sem: up_write(&NILFS_MDT(cpfile)->mi_sem); return ret; } /** * nilfs_cpfile_is_snapshot - * @cpfile: inode of checkpoint file * @cno: checkpoint number * * Description: * * Return Value: On success, 1 is returned if the checkpoint specified by * @cno is a snapshot, or 0 if not. On error, one of the following negative * error codes is returned. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient amount of memory available. * * %-ENOENT - No such checkpoint. */ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) { struct buffer_head *bh; struct nilfs_checkpoint *cp; void *kaddr; int ret; /* * CP number is invalid if it's zero or larger than the * largest existing one. */ if (cno == 0 || cno >= nilfs_mdt_cno(cpfile)) return -ENOENT; down_read(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); if (ret < 0) goto out; kaddr = kmap_atomic(bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); if (nilfs_checkpoint_invalid(cp)) ret = -ENOENT; else ret = nilfs_checkpoint_snapshot(cp); kunmap_atomic(kaddr); brelse(bh); out: up_read(&NILFS_MDT(cpfile)->mi_sem); return ret; } /** * nilfs_cpfile_change_cpmode - change checkpoint mode * @cpfile: inode of checkpoint file * @cno: checkpoint number * @status: mode of checkpoint * * Description: nilfs_change_cpmode() changes the mode of the checkpoint * specified by @cno. The mode @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT. * * Return Value: On success, 0 is returned. On error, one of the following * negative error codes is returned. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient amount of memory available. * * %-ENOENT - No such checkpoint. */ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) { int ret; switch (mode) { case NILFS_CHECKPOINT: if (nilfs_checkpoint_is_mounted(cpfile->i_sb, cno)) /* * Current implementation does not have to protect * plain read-only mounts since they are exclusive * with a read/write mount and are protected from the * cleaner. 
*/ ret = -EBUSY; else ret = nilfs_cpfile_clear_snapshot(cpfile, cno); return ret; case NILFS_SNAPSHOT: return nilfs_cpfile_set_snapshot(cpfile, cno); default: return -EINVAL; } } /** * nilfs_cpfile_get_stat - get checkpoint statistics * @cpfile: inode of checkpoint file * @stat: pointer to a structure of checkpoint statistics * * Description: nilfs_cpfile_get_stat() returns information about checkpoints. * * Return Value: On success, 0 is returned, and checkpoints information is * stored in the place pointed by @stat. On error, one of the following * negative error codes is returned. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient amount of memory available. */ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) { struct buffer_head *bh; struct nilfs_cpfile_header *header; void *kaddr; int ret; down_read(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_header_block(cpfile, &bh); if (ret < 0) goto out_sem; kaddr = kmap_atomic(bh->b_page); header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); cpstat->cs_cno = nilfs_mdt_cno(cpfile); cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints); cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots); kunmap_atomic(kaddr); brelse(bh); out_sem: up_read(&NILFS_MDT(cpfile)->mi_sem); return ret; } /** * nilfs_cpfile_read - read or get cpfile inode * @sb: super block instance * @cpsize: size of a checkpoint entry * @raw_inode: on-disk cpfile inode * @inodep: buffer to store the inode */ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, struct nilfs_inode *raw_inode, struct inode **inodep) { struct inode *cpfile; int err; if (cpsize > sb->s_blocksize) { nilfs_msg(sb, KERN_ERR, "too large checkpoint size: %zu bytes", cpsize); return -EINVAL; } else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) { nilfs_msg(sb, KERN_ERR, "too small checkpoint size: %zu bytes", cpsize); return -EINVAL; } cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO); if (unlikely(!cpfile)) return -ENOMEM; if (!(cpfile->i_state & I_NEW)) goto out; err = nilfs_mdt_init(cpfile, NILFS_MDT_GFP, 0); if (err) goto failed; nilfs_mdt_set_entry_size(cpfile, cpsize, sizeof(struct nilfs_cpfile_header)); err = nilfs_read_inode_common(cpfile, raw_inode); if (err) goto failed; unlock_new_inode(cpfile); out: *inodep = cpfile; return 0; failed: iget_failed(cpfile); return err; } |
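/*
 * The blkoff/offset helpers near the top of cpfile.c map a checkpoint number
 * to a (block, slot) pair with a single division: do_div() leaves the
 * quotient of (cno + first_entry_offset - 1) in place (the block index within
 * the cpfile) and returns the remainder (the entry slot inside that block).
 * The userspace sketch below reproduces that arithmetic; 16 checkpoint
 * entries per block and a first_entry_offset of 2 are illustrative values
 * only -- the real ones come from the mdt entry-size setup done in
 * nilfs_cpfile_read().
 */
#include <stdint.h>
#include <stdio.h>

#define CPS_PER_BLOCK		16ULL	/* illustrative value only */
#define FIRST_ENTRY_OFFSET	2ULL	/* illustrative value only */

int main(void)
{
	uint64_t cno;

	for (cno = 1; cno <= 40; cno += 13) {
		/* Same expression as nilfs_cpfile_get_blkoff()/_get_offset(). */
		uint64_t t = cno + FIRST_ENTRY_OFFSET - 1;

		printf("cno %2llu -> block %llu, slot %llu\n",
		       (unsigned long long)cno,
		       (unsigned long long)(t / CPS_PER_BLOCK),
		       (unsigned long long)(t % CPS_PER_BLOCK));
	}
	return 0;
}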
/* Request a key from userspace * * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version.
* * See Documentation/security/keys/request-key.rst */ #include <linux/module.h> #include <linux/sched.h> #include <linux/kmod.h> #include <linux/err.h> #include <linux/keyctl.h> #include <linux/slab.h> #include "internal.h" #include <keys/request_key_auth-type.h> #define key_negative_timeout 60 /* default timeout on a negative key's existence */ /** * complete_request_key - Complete the construction of a key. * @auth_key: The authorisation key. * @error: The success or failute of the construction. * * Complete the attempt to construct a key. The key will be negated * if an error is indicated. The authorisation key will be revoked * unconditionally. */ void complete_request_key(struct key *authkey, int error) { struct request_key_auth *rka = get_request_key_auth(authkey); struct key *key = rka->target_key; kenter("%d{%d},%d", authkey->serial, key->serial, error); if (error < 0) key_negate_and_link(key, key_negative_timeout, NULL, authkey); else key_revoke(authkey); } EXPORT_SYMBOL(complete_request_key); /* * Initialise a usermode helper that is going to have a specific session * keyring. * * This is called in context of freshly forked kthread before kernel_execve(), * so we can simply install the desired session_keyring at this point. */ static int umh_keys_init(struct subprocess_info *info, struct cred *cred) { struct key *keyring = info->data; return install_session_keyring_to_cred(cred, keyring); } /* * Clean up a usermode helper with session keyring. */ static void umh_keys_cleanup(struct subprocess_info *info) { struct key *keyring = info->data; key_put(keyring); } /* * Call a usermode helper with a specific session keyring. */ static int call_usermodehelper_keys(const char *path, char **argv, char **envp, struct key *session_keyring, int wait) { struct subprocess_info *info; info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL, umh_keys_init, umh_keys_cleanup, session_keyring); if (!info) return -ENOMEM; key_get(session_keyring); return call_usermodehelper_exec(info, wait); } /* * Request userspace finish the construction of a key * - execute "/sbin/request-key <op> <key> <uid> <gid> <keyring> <keyring> <keyring>" */ static int call_sbin_request_key(struct key *authkey, void *aux) { static char const request_key[] = "/sbin/request-key"; struct request_key_auth *rka = get_request_key_auth(authkey); const struct cred *cred = current_cred(); key_serial_t prkey, sskey; struct key *key = rka->target_key, *keyring, *session; char *argv[9], *envp[3], uid_str[12], gid_str[12]; char key_str[12], keyring_str[3][12]; char desc[20]; int ret, i; kenter("{%d},{%d},%s", key->serial, authkey->serial, rka->op); ret = install_user_keyrings(); if (ret < 0) goto error_alloc; /* allocate a new session keyring */ sprintf(desc, "_req.%u", key->serial); cred = get_current_cred(); keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); put_cred(cred); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error_alloc; } /* attach the auth key to the session keyring */ ret = key_link(keyring, authkey); if (ret < 0) goto error_link; /* record the UID and GID */ sprintf(uid_str, "%d", from_kuid(&init_user_ns, cred->fsuid)); sprintf(gid_str, "%d", from_kgid(&init_user_ns, cred->fsgid)); /* we say which key is under construction */ sprintf(key_str, "%d", key->serial); /* we specify the process's default keyrings */ sprintf(keyring_str[0], "%d", cred->thread_keyring ? 
cred->thread_keyring->serial : 0); prkey = 0; if (cred->process_keyring) prkey = cred->process_keyring->serial; sprintf(keyring_str[1], "%d", prkey); rcu_read_lock(); session = rcu_dereference(cred->session_keyring); if (!session) session = cred->user->session_keyring; sskey = session->serial; rcu_read_unlock(); sprintf(keyring_str[2], "%d", sskey); /* set up a minimal environment */ i = 0; envp[i++] = "HOME=/"; envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i] = NULL; /* set up the argument list */ i = 0; argv[i++] = (char *)request_key; argv[i++] = (char *)rka->op; argv[i++] = key_str; argv[i++] = uid_str; argv[i++] = gid_str; argv[i++] = keyring_str[0]; argv[i++] = keyring_str[1]; argv[i++] = keyring_str[2]; argv[i] = NULL; /* do it */ ret = call_usermodehelper_keys(request_key, argv, envp, keyring, UMH_WAIT_PROC); kdebug("usermode -> 0x%x", ret); if (ret >= 0) { /* ret is the exit/wait code */ if (test_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags) || key_validate(key) < 0) ret = -ENOKEY; else /* ignore any errors from userspace if the key was * instantiated */ ret = 0; } error_link: key_put(keyring); error_alloc: complete_request_key(authkey, ret); kleave(" = %d", ret); return ret; } /* * Call out to userspace for key construction. * * Program failure is ignored in favour of key status. */ static int construct_key(struct key *key, const void *callout_info, size_t callout_len, void *aux, struct key *dest_keyring) { request_key_actor_t actor; struct key *authkey; int ret; kenter("%d,%p,%zu,%p", key->serial, callout_info, callout_len, aux); /* allocate an authorisation key */ authkey = request_key_auth_new(key, "create", callout_info, callout_len, dest_keyring); if (IS_ERR(authkey)) return PTR_ERR(authkey); /* Make the call */ actor = call_sbin_request_key; if (key->type->request_key) actor = key->type->request_key; ret = actor(authkey, aux); /* check that the actor called complete_request_key() prior to * returning an error */ WARN_ON(ret < 0 && !test_bit(KEY_FLAG_REVOKED, &authkey->flags)); key_put(authkey); kleave(" = %d", ret); return ret; } /* * Get the appropriate destination keyring for the request. * * The keyring selected is returned with an extra reference upon it which the * caller must release. 
*/ static int construct_get_dest_keyring(struct key **_dest_keyring) { struct request_key_auth *rka; const struct cred *cred = current_cred(); struct key *dest_keyring = *_dest_keyring, *authkey; int ret; kenter("%p", dest_keyring); /* find the appropriate keyring */ if (dest_keyring) { /* the caller supplied one */ key_get(dest_keyring); } else { bool do_perm_check = true; /* use a default keyring; falling through the cases until we * find one that we actually have */ switch (cred->jit_keyring) { case KEY_REQKEY_DEFL_DEFAULT: case KEY_REQKEY_DEFL_REQUESTOR_KEYRING: if (cred->request_key_auth) { authkey = cred->request_key_auth; down_read(&authkey->sem); rka = get_request_key_auth(authkey); if (!test_bit(KEY_FLAG_REVOKED, &authkey->flags)) dest_keyring = key_get(rka->dest_keyring); up_read(&authkey->sem); if (dest_keyring) { do_perm_check = false; break; } } case KEY_REQKEY_DEFL_THREAD_KEYRING: dest_keyring = key_get(cred->thread_keyring); if (dest_keyring) break; case KEY_REQKEY_DEFL_PROCESS_KEYRING: dest_keyring = key_get(cred->process_keyring); if (dest_keyring) break; case KEY_REQKEY_DEFL_SESSION_KEYRING: rcu_read_lock(); dest_keyring = key_get( rcu_dereference(cred->session_keyring)); rcu_read_unlock(); if (dest_keyring) break; case KEY_REQKEY_DEFL_USER_SESSION_KEYRING: dest_keyring = key_get(cred->user->session_keyring); break; case KEY_REQKEY_DEFL_USER_KEYRING: dest_keyring = key_get(cred->user->uid_keyring); break; case KEY_REQKEY_DEFL_GROUP_KEYRING: default: BUG(); } /* * Require Write permission on the keyring. This is essential * because the default keyring may be the session keyring, and * joining a keyring only requires Search permission. * * However, this check is skipped for the "requestor keyring" so * that /sbin/request-key can itself use request_key() to add * keys to the original requestor's destination keyring. */ if (dest_keyring && do_perm_check) { ret = key_permission(make_key_ref(dest_keyring, 1), KEY_NEED_WRITE); if (ret) { key_put(dest_keyring); return ret; } } } *_dest_keyring = dest_keyring; kleave(" [dk %d]", key_serial(dest_keyring)); return 0; } /* * Allocate a new key in under-construction state and attempt to link it in to * the requested keyring. * * May return a key that's already under construction instead if there was a * race between two thread calling request_key(). 
*/ static int construct_alloc_key(struct keyring_search_context *ctx, struct key *dest_keyring, unsigned long flags, struct key_user *user, struct key **_key) { struct assoc_array_edit *edit; struct key *key; key_perm_t perm; key_ref_t key_ref; int ret; kenter("%s,%s,,,", ctx->index_key.type->name, ctx->index_key.description); *_key = NULL; mutex_lock(&user->cons_lock); perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR; perm |= KEY_USR_VIEW; if (ctx->index_key.type->read) perm |= KEY_POS_READ; if (ctx->index_key.type == &key_type_keyring || ctx->index_key.type->update) perm |= KEY_POS_WRITE; key = key_alloc(ctx->index_key.type, ctx->index_key.description, ctx->cred->fsuid, ctx->cred->fsgid, ctx->cred, perm, flags, NULL); if (IS_ERR(key)) goto alloc_failed; set_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags); if (dest_keyring) { ret = __key_link_begin(dest_keyring, &ctx->index_key, &edit); if (ret < 0) goto link_prealloc_failed; } /* attach the key to the destination keyring under lock, but we do need * to do another check just in case someone beat us to it whilst we * waited for locks */ mutex_lock(&key_construction_mutex); key_ref = search_process_keyrings(ctx); if (!IS_ERR(key_ref)) goto key_already_present; if (dest_keyring) __key_link(key, &edit); mutex_unlock(&key_construction_mutex); if (dest_keyring) __key_link_end(dest_keyring, &ctx->index_key, edit); mutex_unlock(&user->cons_lock); *_key = key; kleave(" = 0 [%d]", key_serial(key)); return 0; /* the key is now present - we tell the caller that we found it by * returning -EINPROGRESS */ key_already_present: key_put(key); mutex_unlock(&key_construction_mutex); key = key_ref_to_ptr(key_ref); if (dest_keyring) { ret = __key_link_check_live_key(dest_keyring, key); if (ret == 0) __key_link(key, &edit); __key_link_end(dest_keyring, &ctx->index_key, edit); if (ret < 0) goto link_check_failed; } mutex_unlock(&user->cons_lock); *_key = key; kleave(" = -EINPROGRESS [%d]", key_serial(key)); return -EINPROGRESS; link_check_failed: mutex_unlock(&user->cons_lock); key_put(key); kleave(" = %d [linkcheck]", ret); return ret; link_prealloc_failed: mutex_unlock(&user->cons_lock); key_put(key); kleave(" = %d [prelink]", ret); return ret; alloc_failed: mutex_unlock(&user->cons_lock); kleave(" = %ld", PTR_ERR(key)); return PTR_ERR(key); } /* * Commence key construction. */ static struct key *construct_key_and_link(struct keyring_search_context *ctx, const char *callout_info, size_t callout_len, void *aux, struct key *dest_keyring, unsigned long flags) { struct key_user *user; struct key *key; int ret; kenter(""); if (ctx->index_key.type == &key_type_keyring) return ERR_PTR(-EPERM); ret = construct_get_dest_keyring(&dest_keyring); if (ret) goto error; user = key_user_lookup(current_fsuid()); if (!user) { ret = -ENOMEM; goto error_put_dest_keyring; } ret = construct_alloc_key(ctx, dest_keyring, flags, user, &key); key_user_put(user); if (ret == 0) { ret = construct_key(key, callout_info, callout_len, aux, dest_keyring); if (ret < 0) { kdebug("cons failed"); goto construction_failed; } } else if (ret == -EINPROGRESS) { ret = 0; } else { goto error_put_dest_keyring; } key_put(dest_keyring); kleave(" = key %d", key_serial(key)); return key; construction_failed: key_negate_and_link(key, key_negative_timeout, NULL, NULL); key_put(key); error_put_dest_keyring: key_put(dest_keyring); error: kleave(" = %d", ret); return ERR_PTR(ret); } /** * request_key_and_link - Request a key and cache it in a keyring. * @type: The type of key we want. 
* @description: The searchable description of the key. * @callout_info: The data to pass to the instantiation upcall (or NULL). * @callout_len: The length of callout_info. * @aux: Auxiliary data for the upcall. * @dest_keyring: Where to cache the key. * @flags: Flags to key_alloc(). * * A key matching the specified criteria is searched for in the process's * keyrings and returned with its usage count incremented if found. Otherwise, * if callout_info is not NULL, a key will be allocated and some service * (probably in userspace) will be asked to instantiate it. * * If successfully found or created, the key will be linked to the destination * keyring if one is provided. * * Returns a pointer to the key if successful; -EACCES, -ENOKEY, -EKEYREVOKED * or -EKEYEXPIRED if an inaccessible, negative, revoked or expired key was * found; -ENOKEY if no key was found and no @callout_info was given; -EDQUOT * if insufficient key quota was available to create a new key; or -ENOMEM if * insufficient memory was available. * * If the returned key was created, then it may still be under construction, * and wait_for_key_construction() should be used to wait for that to complete. */ struct key *request_key_and_link(struct key_type *type, const char *description, const void *callout_info, size_t callout_len, void *aux, struct key *dest_keyring, unsigned long flags) { struct keyring_search_context ctx = { .index_key.type = type, .index_key.description = description, .index_key.desc_len = strlen(description), .cred = current_cred(), .match_data.cmp = key_default_cmp, .match_data.raw_data = description, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = (KEYRING_SEARCH_DO_STATE_CHECK | KEYRING_SEARCH_SKIP_EXPIRED), }; struct key *key; key_ref_t key_ref; int ret; kenter("%s,%s,%p,%zu,%p,%p,%lx", ctx.index_key.type->name, ctx.index_key.description, callout_info, callout_len, aux, dest_keyring, flags); if (type->match_preparse) { ret = type->match_preparse(&ctx.match_data); if (ret < 0) { key = ERR_PTR(ret); goto error; } } /* search all the process keyrings for a key */ key_ref = search_process_keyrings(&ctx); if (!IS_ERR(key_ref)) { key = key_ref_to_ptr(key_ref); if (dest_keyring) { ret = key_link(dest_keyring, key); if (ret < 0) { key_put(key); key = ERR_PTR(ret); goto error_free; } } } else if (PTR_ERR(key_ref) != -EAGAIN) { key = ERR_CAST(key_ref); } else { /* the search failed, but the keyrings were searchable, so we * should consult userspace if we can */ key = ERR_PTR(-ENOKEY); if (!callout_info) goto error_free; key = construct_key_and_link(&ctx, callout_info, callout_len, aux, dest_keyring, flags); } error_free: if (type->match_free) type->match_free(&ctx.match_data); error: kleave(" = %p", key); return key; } /** * wait_for_key_construction - Wait for construction of a key to complete * @key: The key being waited for. * @intr: Whether to wait interruptibly. * * Wait for a key to finish being constructed. * * Returns 0 if successful; -ERESTARTSYS if the wait was interrupted; -ENOKEY * if the key was negated; or -EKEYREVOKED or -EKEYEXPIRED if the key was * revoked or expired. */ int wait_for_key_construction(struct key *key, bool intr) { int ret; ret = wait_on_bit(&key->flags, KEY_FLAG_USER_CONSTRUCT, intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); if (ret) return -ERESTARTSYS; ret = key_read_state(key); if (ret < 0) return ret; return key_validate(key); } EXPORT_SYMBOL(wait_for_key_construction); /** * request_key - Request a key and wait for construction * @type: Type of key. 
* @description: The searchable description of the key. * @callout_info: The data to pass to the instantiation upcall (or NULL). * * As for request_key_and_link() except that it does not add the returned key * to a keyring if found, new keys are always allocated in the user's quota, * the callout_info must be a NUL-terminated string and no auxiliary data can * be passed. * * Furthermore, it then works as wait_for_key_construction() to wait for the * completion of keys undergoing construction with a non-interruptible wait. */ struct key *request_key(struct key_type *type, const char *description, const char *callout_info) { struct key *key; size_t callout_len = 0; int ret; if (callout_info) callout_len = strlen(callout_info); key = request_key_and_link(type, description, callout_info, callout_len, NULL, NULL, KEY_ALLOC_IN_QUOTA); if (!IS_ERR(key)) { ret = wait_for_key_construction(key, false); if (ret < 0) { key_put(key); return ERR_PTR(ret); } } return key; } EXPORT_SYMBOL(request_key); /** * request_key_with_auxdata - Request a key with auxiliary data for the upcaller * @type: The type of key we want. * @description: The searchable description of the key. * @callout_info: The data to pass to the instantiation upcall (or NULL). * @callout_len: The length of callout_info. * @aux: Auxiliary data for the upcall. * * As for request_key_and_link() except that it does not add the returned key * to a keyring if found and new keys are always allocated in the user's quota. * * Furthermore, it then works as wait_for_key_construction() to wait for the * completion of keys undergoing construction with a non-interruptible wait. */ struct key *request_key_with_auxdata(struct key_type *type, const char *description, const void *callout_info, size_t callout_len, void *aux) { struct key *key; int ret; key = request_key_and_link(type, description, callout_info, callout_len, aux, NULL, KEY_ALLOC_IN_QUOTA); if (!IS_ERR(key)) { ret = wait_for_key_construction(key, false); if (ret < 0) { key_put(key); return ERR_PTR(ret); } } return key; } EXPORT_SYMBOL(request_key_with_auxdata); /* * request_key_async - Request a key (allow async construction) * @type: Type of key. * @description: The searchable description of the key. * @callout_info: The data to pass to the instantiation upcall (or NULL). * @callout_len: The length of callout_info. * * As for request_key_and_link() except that it does not add the returned key * to a keyring if found, new keys are always allocated in the user's quota and * no auxiliary data can be passed. * * The caller should call wait_for_key_construction() to wait for the * completion of the returned key if it is still undergoing construction. */ struct key *request_key_async(struct key_type *type, const char *description, const void *callout_info, size_t callout_len) { return request_key_and_link(type, description, callout_info, callout_len, NULL, NULL, KEY_ALLOC_IN_QUOTA); } EXPORT_SYMBOL(request_key_async); /* * request a key with auxiliary data for the upcaller (allow async construction) * @type: Type of key. * @description: The searchable description of the key. * @callout_info: The data to pass to the instantiation upcall (or NULL). * @callout_len: The length of callout_info. * @aux: Auxiliary data for the upcall. * * As for request_key_and_link() except that it does not add the returned key * to a keyring if found and new keys are always allocated in the user's quota. 
 * * The caller should call wait_for_key_construction() to wait for the * completion of the returned key if it is still undergoing construction. */ struct key *request_key_async_with_auxdata(struct key_type *type, const char *description, const void *callout_info, size_t callout_len, void *aux) { return request_key_and_link(type, description, callout_info, callout_len, aux, NULL, KEY_ALLOC_IN_QUOTA); } EXPORT_SYMBOL(request_key_async_with_auxdata);
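/*
 * Illustrative sketch, not part of the original request_key.c: a minimal
 * in-kernel caller of request_key() as documented above.  The "user" key
 * type (which would need <keys/user-type.h>), the description
 * "example:token" and the helper name are assumptions made for this example;
 * request_key() itself behaves as documented above and waits for key
 * construction with a non-interruptible wait before returning.
 */
static int example_request_user_key(void)
{
	struct key *key;

	key = request_key(&key_type_user, "example:token", NULL);
	if (IS_ERR(key))
		return PTR_ERR(key);	/* e.g. -ENOKEY: no callout_info given */

	down_read(&key->sem);
	/* ... read the instantiated payload while holding key->sem ... */
	up_read(&key->sem);

	key_put(key);
	return 0;
}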
/* * linux/fs/9p/trans_rdma.c * * RDMA transport layer based on the trans_fd.c implementation.
* * Copyright (C) 2008 by Tom Tucker <tom@opengridcomputing.com> * Copyright (C) 2006 by Russ Cox <rsc@swtch.com> * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net> * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to: * Free Software Foundation * 51 Franklin Street, Fifth Floor * Boston, MA 02111-1301 USA * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/in.h> #include <linux/module.h> #include <linux/net.h> #include <linux/ipv6.h> #include <linux/kthread.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/un.h> #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/idr.h> #include <linux/file.h> #include <linux/parser.h> #include <linux/semaphore.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> #define P9_PORT 5640 #define P9_RDMA_SQ_DEPTH 32 #define P9_RDMA_RQ_DEPTH 32 #define P9_RDMA_SEND_SGE 4 #define P9_RDMA_RECV_SGE 4 #define P9_RDMA_IRD 0 #define P9_RDMA_ORD 0 #define P9_RDMA_TIMEOUT 30000 /* 30 seconds */ #define P9_RDMA_MAXSIZE (1024*1024) /* 1MB */ /** * struct p9_trans_rdma - RDMA transport instance * * @state: tracks the transport state machine for connection setup and tear down * @cm_id: The RDMA CM ID * @pd: Protection Domain pointer * @qp: Queue Pair pointer * @cq: Completion Queue pointer * @timeout: Number of uSecs to wait for connection management events * @privport: Whether a privileged port may be used * @port: The port to use * @sq_depth: The depth of the Send Queue * @sq_sem: Semaphore for the SQ * @rq_depth: The depth of the Receive Queue. * @rq_sem: Semaphore for the RQ * @excess_rc : Amount of posted Receive Contexts without a pending request. * See rdma_request() * @addr: The remote peer's address * @req_lock: Protects the active request list * @cm_done: Completion event for connection management tracking */ struct p9_trans_rdma { enum { P9_RDMA_INIT, P9_RDMA_ADDR_RESOLVED, P9_RDMA_ROUTE_RESOLVED, P9_RDMA_CONNECTED, P9_RDMA_FLUSHING, P9_RDMA_CLOSING, P9_RDMA_CLOSED, } state; struct rdma_cm_id *cm_id; struct ib_pd *pd; struct ib_qp *qp; struct ib_cq *cq; long timeout; bool privport; u16 port; int sq_depth; struct semaphore sq_sem; int rq_depth; struct semaphore rq_sem; atomic_t excess_rc; struct sockaddr_in addr; spinlock_t req_lock; struct completion cm_done; }; /** * p9_rdma_context - Keeps track of in-process WR * * @busa: Bus address to unmap when the WR completes * @req: Keeps track of requests (send) * @rc: Keepts track of replies (receive) */ struct p9_rdma_req; struct p9_rdma_context { struct ib_cqe cqe; dma_addr_t busa; union { struct p9_req_t *req; struct p9_fcall rc; }; }; /** * p9_rdma_opts - Collection of mount options * @port: port of connection * @sq_depth: The requested depth of the SQ. 
This really doesn't need * to be any deeper than the number of threads used in the client * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth * @timeout: Time to wait in msecs for CM events */ struct p9_rdma_opts { short port; bool privport; int sq_depth; int rq_depth; long timeout; }; /* * Option Parsing (code inspired by NFS code) */ enum { /* Options that take integer arguments */ Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, /* Options that take no argument */ Opt_privport, Opt_err, }; static match_table_t tokens = { {Opt_port, "port=%u"}, {Opt_sq_depth, "sq=%u"}, {Opt_rq_depth, "rq=%u"}, {Opt_timeout, "timeout=%u"}, {Opt_privport, "privport"}, {Opt_err, NULL}, }; static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt) { struct p9_trans_rdma *rdma = clnt->trans; if (rdma->port != P9_PORT) seq_printf(m, ",port=%u", rdma->port); if (rdma->sq_depth != P9_RDMA_SQ_DEPTH) seq_printf(m, ",sq=%u", rdma->sq_depth); if (rdma->rq_depth != P9_RDMA_RQ_DEPTH) seq_printf(m, ",rq=%u", rdma->rq_depth); if (rdma->timeout != P9_RDMA_TIMEOUT) seq_printf(m, ",timeout=%lu", rdma->timeout); if (rdma->privport) seq_puts(m, ",privport"); return 0; } /** * parse_opts - parse mount options into rdma options structure * @params: options string passed from mount * @opts: rdma transport-specific structure to parse options into * * Returns 0 upon success, -ERRNO upon failure */ static int parse_opts(char *params, struct p9_rdma_opts *opts) { char *p; substring_t args[MAX_OPT_ARGS]; int option; char *options, *tmp_options; opts->port = P9_PORT; opts->sq_depth = P9_RDMA_SQ_DEPTH; opts->rq_depth = P9_RDMA_RQ_DEPTH; opts->timeout = P9_RDMA_TIMEOUT; opts->privport = false; if (!params) return 0; tmp_options = kstrdup(params, GFP_KERNEL); if (!tmp_options) { p9_debug(P9_DEBUG_ERROR, "failed to allocate copy of option string\n"); return -ENOMEM; } options = tmp_options; while ((p = strsep(&options, ",")) != NULL) { int token; int r; if (!*p) continue; token = match_token(p, tokens, args); if ((token != Opt_err) && (token != Opt_privport)) { r = match_int(&args[0], &option); if (r < 0) { p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); continue; } } switch (token) { case Opt_port: opts->port = option; break; case Opt_sq_depth: opts->sq_depth = option; break; case Opt_rq_depth: opts->rq_depth = option; break; case Opt_timeout: opts->timeout = option; break; case Opt_privport: opts->privport = true; break; default: continue; } } /* RQ must be at least as large as the SQ */ opts->rq_depth = max(opts->rq_depth, opts->sq_depth); kfree(tmp_options); return 0; } static int p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct p9_client *c = id->context; struct p9_trans_rdma *rdma = c->trans; switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: BUG_ON(rdma->state != P9_RDMA_INIT); rdma->state = P9_RDMA_ADDR_RESOLVED; break; case RDMA_CM_EVENT_ROUTE_RESOLVED: BUG_ON(rdma->state != P9_RDMA_ADDR_RESOLVED); rdma->state = P9_RDMA_ROUTE_RESOLVED; break; case RDMA_CM_EVENT_ESTABLISHED: BUG_ON(rdma->state != P9_RDMA_ROUTE_RESOLVED); rdma->state = P9_RDMA_CONNECTED; break; case RDMA_CM_EVENT_DISCONNECTED: if (rdma) rdma->state = P9_RDMA_CLOSED; c->status = Disconnected; break; case RDMA_CM_EVENT_TIMEWAIT_EXIT: break; case RDMA_CM_EVENT_ADDR_CHANGE: case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_MULTICAST_JOIN: case RDMA_CM_EVENT_MULTICAST_ERROR: case RDMA_CM_EVENT_REJECTED: case 
RDMA_CM_EVENT_CONNECT_REQUEST: case RDMA_CM_EVENT_CONNECT_RESPONSE: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_UNREACHABLE: c->status = Disconnected; rdma_disconnect(rdma->cm_id); break; default: BUG(); } complete(&rdma->cm_done); return 0; } static void recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct p9_client *client = cq->cq_context; struct p9_trans_rdma *rdma = client->trans; struct p9_rdma_context *c = container_of(wc->wr_cqe, struct p9_rdma_context, cqe); struct p9_req_t *req; int err = 0; int16_t tag; req = NULL; ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize, DMA_FROM_DEVICE); if (wc->status != IB_WC_SUCCESS) goto err_out; c->rc.size = wc->byte_len; err = p9_parse_header(&c->rc, NULL, NULL, &tag, 1); if (err) goto err_out; req = p9_tag_lookup(client, tag); if (!req) goto err_out; /* Check that we have not yet received a reply for this request. */ if (unlikely(req->rc.sdata)) { pr_err("Duplicate reply for request %d", tag); goto err_out; } req->rc.size = c->rc.size; req->rc.sdata = c->rc.sdata; p9_client_cb(client, req, REQ_STATUS_RCVD); out: up(&rdma->rq_sem); kfree(c); return; err_out: p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n", req, err, wc->status); rdma->state = P9_RDMA_FLUSHING; client->status = Disconnected; goto out; } static void send_done(struct ib_cq *cq, struct ib_wc *wc) { struct p9_client *client = cq->cq_context; struct p9_trans_rdma *rdma = client->trans; struct p9_rdma_context *c = container_of(wc->wr_cqe, struct p9_rdma_context, cqe); ib_dma_unmap_single(rdma->cm_id->device, c->busa, c->req->tc.size, DMA_TO_DEVICE); up(&rdma->sq_sem); p9_req_put(c->req); kfree(c); } static void qp_event_handler(struct ib_event *event, void *context) { p9_debug(P9_DEBUG_ERROR, "QP event %d context %p\n", event->event, context); } static void rdma_destroy_trans(struct p9_trans_rdma *rdma) { if (!rdma) return; if (rdma->qp && !IS_ERR(rdma->qp)) ib_destroy_qp(rdma->qp); if (rdma->pd && !IS_ERR(rdma->pd)) ib_dealloc_pd(rdma->pd); if (rdma->cq && !IS_ERR(rdma->cq)) ib_free_cq(rdma->cq); if (rdma->cm_id && !IS_ERR(rdma->cm_id)) rdma_destroy_id(rdma->cm_id); kfree(rdma); } static int post_recv(struct p9_client *client, struct p9_rdma_context *c) { struct p9_trans_rdma *rdma = client->trans; struct ib_recv_wr wr; struct ib_sge sge; c->busa = ib_dma_map_single(rdma->cm_id->device, c->rc.sdata, client->msize, DMA_FROM_DEVICE); if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) goto error; c->cqe.done = recv_done; sge.addr = c->busa; sge.length = client->msize; sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; wr.wr_cqe = &c->cqe; wr.sg_list = &sge; wr.num_sge = 1; return ib_post_recv(rdma->qp, &wr, NULL); error: p9_debug(P9_DEBUG_ERROR, "EIO\n"); return -EIO; } static int rdma_request(struct p9_client *client, struct p9_req_t *req) { struct p9_trans_rdma *rdma = client->trans; struct ib_send_wr wr; struct ib_sge sge; int err = 0; unsigned long flags; struct p9_rdma_context *c = NULL; struct p9_rdma_context *rpl_context = NULL; /* When an error occurs between posting the recv and the send, * there will be a receive context posted without a pending request. * Since there is no way to "un-post" it, we remember it and skip * post_recv() for the next request. * So here, * see if we are this `next request' and need to absorb an excess rc. * If yes, then drop and free our own, and do not recv_post(). **/ if (unlikely(atomic_read(&rdma->excess_rc) > 0)) { if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) { /* Got one! 
*/ p9_fcall_fini(&req->rc); req->rc.sdata = NULL; goto dont_need_post_recv; } else { /* We raced and lost. */ atomic_inc(&rdma->excess_rc); } } /* Allocate an fcall for the reply */ rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS); if (!rpl_context) { err = -ENOMEM; goto recv_error; } rpl_context->rc.sdata = req->rc.sdata; /* * Post a receive buffer for this request. We need to ensure * there is a reply buffer available for every outstanding * request. A flushed request can result in no reply for an * outstanding request, so we must keep a count to avoid * overflowing the RQ. */ if (down_interruptible(&rdma->rq_sem)) { err = -EINTR; goto recv_error; } err = post_recv(client, rpl_context); if (err) { p9_debug(P9_DEBUG_ERROR, "POST RECV failed: %d\n", err); goto recv_error; } /* remove posted receive buffer from request structure */ req->rc.sdata = NULL; dont_need_post_recv: /* Post the request */ c = kmalloc(sizeof *c, GFP_NOFS); if (!c) { err = -ENOMEM; goto send_error; } c->req = req; c->busa = ib_dma_map_single(rdma->cm_id->device, c->req->tc.sdata, c->req->tc.size, DMA_TO_DEVICE); if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) { err = -EIO; goto send_error; } c->cqe.done = send_done; sge.addr = c->busa; sge.length = c->req->tc.size; sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; wr.wr_cqe = &c->cqe; wr.opcode = IB_WR_SEND; wr.send_flags = IB_SEND_SIGNALED; wr.sg_list = &sge; wr.num_sge = 1; if (down_interruptible(&rdma->sq_sem)) { err = -EINTR; goto send_error; } /* Mark request as `sent' *before* we actually send it, * because doing if after could erase the REQ_STATUS_RCVD * status in case of a very fast reply. */ req->status = REQ_STATUS_SENT; err = ib_post_send(rdma->qp, &wr, NULL); if (err) goto send_error; /* Success */ return 0; /* Handle errors that happened during or while preparing the send: */ send_error: req->status = REQ_STATUS_ERROR; kfree(c); p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err); /* Ach. * We did recv_post(), but not send. We have one recv_post in excess. */ atomic_inc(&rdma->excess_rc); return err; /* Handle errors that happened during or while preparing post_recv(): */ recv_error: kfree(rpl_context); spin_lock_irqsave(&rdma->req_lock, flags); if (err != -EINTR && rdma->state < P9_RDMA_CLOSING) { rdma->state = P9_RDMA_CLOSING; spin_unlock_irqrestore(&rdma->req_lock, flags); rdma_disconnect(rdma->cm_id); } else spin_unlock_irqrestore(&rdma->req_lock, flags); return err; } static void rdma_close(struct p9_client *client) { struct p9_trans_rdma *rdma; if (!client) return; rdma = client->trans; if (!rdma) return; client->status = Disconnected; rdma_disconnect(rdma->cm_id); rdma_destroy_trans(rdma); } /** * alloc_rdma - Allocate and initialize the rdma transport structure * @opts: Mount options structure */ static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts) { struct p9_trans_rdma *rdma; rdma = kzalloc(sizeof(struct p9_trans_rdma), GFP_KERNEL); if (!rdma) return NULL; rdma->port = opts->port; rdma->privport = opts->privport; rdma->sq_depth = opts->sq_depth; rdma->rq_depth = opts->rq_depth; rdma->timeout = opts->timeout; spin_lock_init(&rdma->req_lock); init_completion(&rdma->cm_done); sema_init(&rdma->sq_sem, rdma->sq_depth); sema_init(&rdma->rq_sem, rdma->rq_depth); atomic_set(&rdma->excess_rc, 0); return rdma; } static int rdma_cancel(struct p9_client *client, struct p9_req_t *req) { /* Nothing to do here. 
* We will take care of it (if we have to) in rdma_cancelled() */ return 1; } /* A request has been fully flushed without a reply. * That means we have posted one buffer in excess. */ static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req) { struct p9_trans_rdma *rdma = client->trans; atomic_inc(&rdma->excess_rc); return 0; } static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma) { struct sockaddr_in cl = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), }; int port, err = -EINVAL; for (port = P9_DEF_MAX_RESVPORT; port >= P9_DEF_MIN_RESVPORT; port--) { cl.sin_port = htons((ushort)port); err = rdma_bind_addr(rdma->cm_id, (struct sockaddr *)&cl); if (err != -EADDRINUSE) break; } return err; } /** * rdma_create_trans - Transport method for creating a transport instance * @client: client instance * @addr: IP address string * @args: Mount options string */ static int rdma_create_trans(struct p9_client *client, const char *addr, char *args) { int err; struct p9_rdma_opts opts; struct p9_trans_rdma *rdma; struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; if (addr == NULL) return -EINVAL; /* Parse the transport specific mount options */ err = parse_opts(args, &opts); if (err < 0) return err; /* Create and initialize the RDMA transport structure */ rdma = alloc_rdma(&opts); if (!rdma) return -ENOMEM; /* Create the RDMA CM ID */ rdma->cm_id = rdma_create_id(&init_net, p9_cm_event_handler, client, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(rdma->cm_id)) goto error; /* Associate the client with the transport */ client->trans = rdma; /* Bind to a privileged port if we need to */ if (opts.privport) { err = p9_rdma_bind_privport(rdma); if (err < 0) { pr_err("%s (%d): problem binding to privport: %d\n", __func__, task_pid_nr(current), -err); goto error; } } /* Resolve the server's address */ rdma->addr.sin_family = AF_INET; rdma->addr.sin_addr.s_addr = in_aton(addr); rdma->addr.sin_port = htons(opts.port); err = rdma_resolve_addr(rdma->cm_id, NULL, (struct sockaddr *)&rdma->addr, rdma->timeout); if (err) goto error; err = wait_for_completion_interruptible(&rdma->cm_done); if (err || (rdma->state != P9_RDMA_ADDR_RESOLVED)) goto error; /* Resolve the route to the server */ err = rdma_resolve_route(rdma->cm_id, rdma->timeout); if (err) goto error; err = wait_for_completion_interruptible(&rdma->cm_done); if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED)) goto error; /* Create the Completion Queue */ rdma->cq = ib_alloc_cq(rdma->cm_id->device, client, opts.sq_depth + opts.rq_depth + 1, 0, IB_POLL_SOFTIRQ); if (IS_ERR(rdma->cq)) goto error; /* Create the Protection Domain */ rdma->pd = ib_alloc_pd(rdma->cm_id->device, 0); if (IS_ERR(rdma->pd)) goto error; /* Create the Queue Pair */ memset(&qp_attr, 0, sizeof qp_attr); qp_attr.event_handler = qp_event_handler; qp_attr.qp_context = client; qp_attr.cap.max_send_wr = opts.sq_depth; qp_attr.cap.max_recv_wr = opts.rq_depth; qp_attr.cap.max_send_sge = P9_RDMA_SEND_SGE; qp_attr.cap.max_recv_sge = P9_RDMA_RECV_SGE; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; qp_attr.qp_type = IB_QPT_RC; qp_attr.send_cq = rdma->cq; qp_attr.recv_cq = rdma->cq; err = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr); if (err) goto error; rdma->qp = rdma->cm_id->qp; /* Request a connection */ memset(&conn_param, 0, sizeof(conn_param)); conn_param.private_data = NULL; conn_param.private_data_len = 0; conn_param.responder_resources = P9_RDMA_IRD; conn_param.initiator_depth = P9_RDMA_ORD; err = rdma_connect(rdma->cm_id, &conn_param); if (err) goto 
error; err = wait_for_completion_interruptible(&rdma->cm_done); if (err || (rdma->state != P9_RDMA_CONNECTED)) goto error; client->status = Connected; return 0; error: rdma_destroy_trans(rdma); return -ENOTCONN; } static struct p9_trans_module p9_rdma_trans = { .name = "rdma", .maxsize = P9_RDMA_MAXSIZE, .def = 0, .owner = THIS_MODULE, .create = rdma_create_trans, .close = rdma_close, .request = rdma_request, .cancel = rdma_cancel, .cancelled = rdma_cancelled, .show_options = p9_rdma_show_options, }; /** * p9_trans_rdma_init - Register the 9P RDMA transport driver */ static int __init p9_trans_rdma_init(void) { v9fs_register_trans(&p9_rdma_trans); return 0; } static void __exit p9_trans_rdma_exit(void) { v9fs_unregister_trans(&p9_rdma_trans); } module_init(p9_trans_rdma_init); module_exit(p9_trans_rdma_exit); MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>"); MODULE_DESCRIPTION("RDMA Transport for 9P"); MODULE_LICENSE("Dual BSD/GPL");
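/*
 * Illustrative sketch, not part of the original trans_rdma.c: exercising the
 * mount-option parser above.  The option string and the pr_info() reporting
 * are assumptions made for this example; parse_opts(), struct p9_rdma_opts
 * and the P9_* defaults come from the code above.  Note that parse_opts()
 * raises rq_depth to at least sq_depth, so "rq=8" below ends up as 16.
 */
static void example_parse_rdma_opts(void)
{
	struct p9_rdma_opts opts;
	char args[] = "port=5640,sq=16,rq=8,privport";

	if (parse_opts(args, &opts) == 0)
		pr_info("9p/rdma options: port=%d sq=%d rq=%d privport=%d\n",
			opts.port, opts.sq_depth, opts.rq_depth, opts.privport);
}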
/* * AppArmor security module * * This file contains AppArmor mediation of files * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2010 Canonical Ltd. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation, version 2 of the * License.
*/ #include <linux/tty.h> #include <linux/fdtable.h> #include <linux/file.h> #include "include/apparmor.h" #include "include/audit.h" #include "include/cred.h" #include "include/file.h" #include "include/match.h" #include "include/net.h" #include "include/path.h" #include "include/policy.h" #include "include/label.h" static u32 map_mask_to_chr_mask(u32 mask) { u32 m = mask & PERMS_CHRS_MASK; if (mask & AA_MAY_GETATTR) m |= MAY_READ; if (mask & (AA_MAY_SETATTR | AA_MAY_CHMOD | AA_MAY_CHOWN)) m |= MAY_WRITE; return m; } /** * audit_file_mask - convert mask to permission string * @buffer: buffer to write string to (NOT NULL) * @mask: permission mask to convert */ static void audit_file_mask(struct audit_buffer *ab, u32 mask) { char str[10]; aa_perm_mask_to_str(str, sizeof(str), aa_file_perm_chrs, map_mask_to_chr_mask(mask)); audit_log_string(ab, str); } /** * file_audit_cb - call back for file specific audit fields * @ab: audit_buffer (NOT NULL) * @va: audit struct to audit values of (NOT NULL) */ static void file_audit_cb(struct audit_buffer *ab, void *va) { struct common_audit_data *sa = va; kuid_t fsuid = current_fsuid(); if (aad(sa)->request & AA_AUDIT_FILE_MASK) { audit_log_format(ab, " requested_mask="); audit_file_mask(ab, aad(sa)->request); } if (aad(sa)->denied & AA_AUDIT_FILE_MASK) { audit_log_format(ab, " denied_mask="); audit_file_mask(ab, aad(sa)->denied); } if (aad(sa)->request & AA_AUDIT_FILE_MASK) { audit_log_format(ab, " fsuid=%d", from_kuid(&init_user_ns, fsuid)); audit_log_format(ab, " ouid=%d", from_kuid(&init_user_ns, aad(sa)->fs.ouid)); } if (aad(sa)->peer) { audit_log_format(ab, " target="); aa_label_xaudit(ab, labels_ns(aad(sa)->label), aad(sa)->peer, FLAG_VIEW_SUBNS, GFP_ATOMIC); } else if (aad(sa)->fs.target) { audit_log_format(ab, " target="); audit_log_untrustedstring(ab, aad(sa)->fs.target); } } /** * aa_audit_file - handle the auditing of file operations * @profile: the profile being enforced (NOT NULL) * @perms: the permissions computed for the request (NOT NULL) * @op: operation being mediated * @request: permissions requested * @name: name of object being mediated (MAYBE NULL) * @target: name of target (MAYBE NULL) * @tlabel: target label (MAY BE NULL) * @ouid: object uid * @info: extra information message (MAYBE NULL) * @error: 0 if operation allowed else failure error code * * Returns: %0 or error on failure */ int aa_audit_file(struct aa_profile *profile, struct aa_perms *perms, const char *op, u32 request, const char *name, const char *target, struct aa_label *tlabel, kuid_t ouid, const char *info, int error) { int type = AUDIT_APPARMOR_AUTO; DEFINE_AUDIT_DATA(sa, LSM_AUDIT_DATA_TASK, op); sa.u.tsk = NULL; aad(&sa)->request = request; aad(&sa)->name = name; aad(&sa)->fs.target = target; aad(&sa)->peer = tlabel; aad(&sa)->fs.ouid = ouid; aad(&sa)->info = info; aad(&sa)->error = error; sa.u.tsk = NULL; if (likely(!aad(&sa)->error)) { u32 mask = perms->audit; if (unlikely(AUDIT_MODE(profile) == AUDIT_ALL)) mask = 0xffff; /* mask off perms that are not being force audited */ aad(&sa)->request &= mask; if (likely(!aad(&sa)->request)) return 0; type = AUDIT_APPARMOR_AUDIT; } else { /* only report permissions that were denied */ aad(&sa)->request = aad(&sa)->request & ~perms->allow; AA_BUG(!aad(&sa)->request); if (aad(&sa)->request & perms->kill) type = AUDIT_APPARMOR_KILL; /* quiet known rejects, assumes quiet and kill do not overlap */ if ((aad(&sa)->request & perms->quiet) && AUDIT_MODE(profile) != AUDIT_NOQUIET && AUDIT_MODE(profile) != AUDIT_ALL) 
aad(&sa)->request &= ~perms->quiet; if (!aad(&sa)->request) return aad(&sa)->error; } aad(&sa)->denied = aad(&sa)->request & ~perms->allow; return aa_audit(type, profile, &sa, file_audit_cb); } /** * is_deleted - test if a file has been completely unlinked * @dentry: dentry of file to test for deletion (NOT NULL) * * Returns: %1 if deleted else %0 */ static inline bool is_deleted(struct dentry *dentry) { if (d_unlinked(dentry) && d_backing_inode(dentry)->i_nlink == 0) return 1; return 0; } static int path_name(const char *op, struct aa_label *label, const struct path *path, int flags, char *buffer, const char **name, struct path_cond *cond, u32 request) { struct aa_profile *profile; const char *info = NULL; int error; error = aa_path_name(path, flags, buffer, name, &info, labels_profile(label)->disconnected); if (error) { fn_for_each_confined(label, profile, aa_audit_file(profile, &nullperms, op, request, *name, NULL, NULL, cond->uid, info, error)); return error; } return 0; } /** * map_old_perms - map old file perms layout to the new layout * @old: permission set in old mapping * * Returns: new permission mapping */ static u32 map_old_perms(u32 old) { u32 new = old & 0xf; if (old & MAY_READ) new |= AA_MAY_GETATTR | AA_MAY_OPEN; if (old & MAY_WRITE) new |= AA_MAY_SETATTR | AA_MAY_CREATE | AA_MAY_DELETE | AA_MAY_CHMOD | AA_MAY_CHOWN | AA_MAY_OPEN; if (old & 0x10) new |= AA_MAY_LINK; /* the old mapping lock and link_subset flags where overlaid * and use was determined by part of a pair that they were in */ if (old & 0x20) new |= AA_MAY_LOCK | AA_LINK_SUBSET; if (old & 0x40) /* AA_EXEC_MMAP */ new |= AA_EXEC_MMAP; return new; } /** * aa_compute_fperms - convert dfa compressed perms to internal perms * @dfa: dfa to compute perms for (NOT NULL) * @state: state in dfa * @cond: conditions to consider (NOT NULL) * * TODO: convert from dfa + state to permission entry, do computation conversion * at load time. * * Returns: computed permission set */ struct aa_perms aa_compute_fperms(struct aa_dfa *dfa, unsigned int state, struct path_cond *cond) { /* FIXME: change over to new dfa format * currently file perms are encoded in the dfa, new format * splits the permissions from the dfa. 
This mapping can be * done at profile load */ struct aa_perms perms = { }; if (uid_eq(current_fsuid(), cond->uid)) { perms.allow = map_old_perms(dfa_user_allow(dfa, state)); perms.audit = map_old_perms(dfa_user_audit(dfa, state)); perms.quiet = map_old_perms(dfa_user_quiet(dfa, state)); perms.xindex = dfa_user_xindex(dfa, state); } else { perms.allow = map_old_perms(dfa_other_allow(dfa, state)); perms.audit = map_old_perms(dfa_other_audit(dfa, state)); perms.quiet = map_old_perms(dfa_other_quiet(dfa, state)); perms.xindex = dfa_other_xindex(dfa, state); } perms.allow |= AA_MAY_GETATTR; /* change_profile wasn't determined by ownership in old mapping */ if (ACCEPT_TABLE(dfa)[state] & 0x80000000) perms.allow |= AA_MAY_CHANGE_PROFILE; if (ACCEPT_TABLE(dfa)[state] & 0x40000000) perms.allow |= AA_MAY_ONEXEC; return perms; } /** * aa_str_perms - find permission that match @name * @dfa: to match against (MAYBE NULL) * @state: state to start matching in * @name: string to match against dfa (NOT NULL) * @cond: conditions to consider for permission set computation (NOT NULL) * @perms: Returns - the permissions found when matching @name * * Returns: the final state in @dfa when beginning @start and walking @name */ unsigned int aa_str_perms(struct aa_dfa *dfa, unsigned int start, const char *name, struct path_cond *cond, struct aa_perms *perms) { unsigned int state; state = aa_dfa_match(dfa, start, name); *perms = aa_compute_fperms(dfa, state, cond); return state; } int __aa_path_perm(const char *op, struct aa_profile *profile, const char *name, u32 request, struct path_cond *cond, int flags, struct aa_perms *perms) { int e = 0; if (profile_unconfined(profile)) return 0; aa_str_perms(profile->file.dfa, profile->file.start, name, cond, perms); if (request & ~perms->allow) e = -EACCES; return aa_audit_file(profile, perms, op, request, name, NULL, NULL, cond->uid, NULL, e); } static int profile_path_perm(const char *op, struct aa_profile *profile, const struct path *path, char *buffer, u32 request, struct path_cond *cond, int flags, struct aa_perms *perms) { const char *name; int error; if (profile_unconfined(profile)) return 0; error = path_name(op, &profile->label, path, flags | profile->path_flags, buffer, &name, cond, request); if (error) return error; return __aa_path_perm(op, profile, name, request, cond, flags, perms); } /** * aa_path_perm - do permissions check & audit for @path * @op: operation being checked * @label: profile being enforced (NOT NULL) * @path: path to check permissions of (NOT NULL) * @flags: any additional path flags beyond what the profile specifies * @request: requested permissions * @cond: conditional info for this request (NOT NULL) * * Returns: %0 else error if access denied or other error */ int aa_path_perm(const char *op, struct aa_label *label, const struct path *path, int flags, u32 request, struct path_cond *cond) { struct aa_perms perms = {}; struct aa_profile *profile; char *buffer = NULL; int error; flags |= PATH_DELEGATE_DELETED | (S_ISDIR(cond->mode) ? PATH_IS_DIR : 0); get_buffers(buffer); error = fn_for_each_confined(label, profile, profile_path_perm(op, profile, path, buffer, request, cond, flags, &perms)); put_buffers(buffer); return error; } /** * xindex_is_subset - helper for aa_path_link * @link: link permission set * @target: target permission set * * test target x permissions are equal OR a subset of link x permissions * this is done as part of the subset test, where a hardlink must have * a subset of permissions that the target has. 
* * Returns: %1 if subset else %0 */ static inline bool xindex_is_subset(u32 link, u32 target) { if (((link & ~AA_X_UNSAFE) != (target & ~AA_X_UNSAFE)) || ((link & AA_X_UNSAFE) && !(target & AA_X_UNSAFE))) return 0; return 1; } static int profile_path_link(struct aa_profile *profile, const struct path *link, char *buffer, const struct path *target, char *buffer2, struct path_cond *cond) { const char *lname, *tname = NULL; struct aa_perms lperms = {}, perms; const char *info = NULL; u32 request = AA_MAY_LINK; unsigned int state; int error; error = path_name(OP_LINK, &profile->label, link, profile->path_flags, buffer, &lname, cond, AA_MAY_LINK); if (error) goto audit; /* buffer2 freed below, tname is pointer in buffer2 */ error = path_name(OP_LINK, &profile->label, target, profile->path_flags, buffer2, &tname, cond, AA_MAY_LINK); if (error) goto audit; error = -EACCES; /* aa_str_perms - handles the case of the dfa being NULL */ state = aa_str_perms(profile->file.dfa, profile->file.start, lname, cond, &lperms); if (!(lperms.allow & AA_MAY_LINK)) goto audit; /* test to see if target can be paired with link */ state = aa_dfa_null_transition(profile->file.dfa, state); aa_str_perms(profile->file.dfa, state, tname, cond, &perms); /* force audit/quiet masks for link are stored in the second entry * in the link pair. */ lperms.audit = perms.audit; lperms.quiet = perms.quiet; lperms.kill = perms.kill; if (!(perms.allow & AA_MAY_LINK)) { info = "target restricted"; lperms = perms; goto audit; } /* done if link subset test is not required */ if (!(perms.allow & AA_LINK_SUBSET)) goto done_tests; /* Do link perm subset test requiring allowed permission on link are * a subset of the allowed permissions on target. */ aa_str_perms(profile->file.dfa, profile->file.start, tname, cond, &perms); /* AA_MAY_LINK is not considered in the subset test */ request = lperms.allow & ~AA_MAY_LINK; lperms.allow &= perms.allow | AA_MAY_LINK; request |= AA_AUDIT_FILE_MASK & (lperms.allow & ~perms.allow); if (request & ~lperms.allow) { goto audit; } else if ((lperms.allow & MAY_EXEC) && !xindex_is_subset(lperms.xindex, perms.xindex)) { lperms.allow &= ~MAY_EXEC; request |= MAY_EXEC; info = "link not subset of target"; goto audit; } done_tests: error = 0; audit: return aa_audit_file(profile, &lperms, OP_LINK, request, lname, tname, NULL, cond->uid, info, error); } /** * aa_path_link - Handle hard link permission check * @label: the label being enforced (NOT NULL) * @old_dentry: the target dentry (NOT NULL) * @new_dir: directory the new link will be created in (NOT NULL) * @new_dentry: the link being created (NOT NULL) * * Handle the permission test for a link & target pair. Permission * is encoded as a pair where the link permission is determined * first, and if allowed, the target is tested. The target test * is done from the point of the link match (not start of DFA) * making the target permission dependent on the link permission match. * * The subset test if required forces that permissions granted * on link are a subset of the permission granted to target. 
* * Returns: %0 if allowed else error */ int aa_path_link(struct aa_label *label, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { struct path link = { .mnt = new_dir->mnt, .dentry = new_dentry }; struct path target = { .mnt = new_dir->mnt, .dentry = old_dentry }; struct path_cond cond = { d_backing_inode(old_dentry)->i_uid, d_backing_inode(old_dentry)->i_mode }; char *buffer = NULL, *buffer2 = NULL; struct aa_profile *profile; int error; /* buffer freed below, lname is pointer in buffer */ get_buffers(buffer, buffer2); error = fn_for_each_confined(label, profile, profile_path_link(profile, &link, buffer, &target, buffer2, &cond)); put_buffers(buffer, buffer2); return error; } static void update_file_ctx(struct aa_file_ctx *fctx, struct aa_label *label, u32 request) { struct aa_label *l, *old; /* update caching of label on file_ctx */ spin_lock(&fctx->lock); old = rcu_dereference_protected(fctx->label, spin_is_locked(&fctx->lock)); l = aa_label_merge(old, label, GFP_ATOMIC); if (l) { if (l != old) { rcu_assign_pointer(fctx->label, l); aa_put_label(old); } else aa_put_label(l); fctx->allow |= request; } spin_unlock(&fctx->lock); } static int __file_path_perm(const char *op, struct aa_label *label, struct aa_label *flabel, struct file *file, u32 request, u32 denied) { struct aa_profile *profile; struct aa_perms perms = {}; struct path_cond cond = { .uid = file_inode(file)->i_uid, .mode = file_inode(file)->i_mode }; char *buffer; int flags, error; /* revalidation due to label out of date. No revocation at this time */ if (!denied && aa_label_is_subset(flabel, label)) /* TODO: check for revocation on stale profiles */ return 0; flags = PATH_DELEGATE_DELETED | (S_ISDIR(cond.mode) ? PATH_IS_DIR : 0); get_buffers(buffer); /* check every profile in task label not in current cache */ error = fn_for_each_not_in_set(flabel, label, profile, profile_path_perm(op, profile, &file->f_path, buffer, request, &cond, flags, &perms)); if (denied && !error) { /* * check every profile in file label that was not tested * in the initial check above. * * TODO: cache full perms so this only happens because of * conditionals * TODO: don't audit here */ if (label == flabel) error = fn_for_each(label, profile, profile_path_perm(op, profile, &file->f_path, buffer, request, &cond, flags, &perms)); else error = fn_for_each_not_in_set(label, flabel, profile, profile_path_perm(op, profile, &file->f_path, buffer, request, &cond, flags, &perms)); } if (!error) update_file_ctx(file_ctx(file), label, request); put_buffers(buffer); return error; } static int __file_sock_perm(const char *op, struct aa_label *label, struct aa_label *flabel, struct file *file, u32 request, u32 denied) { struct socket *sock = (struct socket *) file->private_data; int error; AA_BUG(!sock); /* revalidation due to label out of date. 
No revocation at this time */ if (!denied && aa_label_is_subset(flabel, label)) return 0; /* TODO: improve to skip profiles cached in flabel */ error = aa_sock_file_perm(label, op, request, sock); if (denied) { /* TODO: improve to skip profiles checked above */ /* check every profile in file label to is cached */ last_error(error, aa_sock_file_perm(flabel, op, request, sock)); } if (!error) update_file_ctx(file_ctx(file), label, request); return error; } /** * aa_file_perm - do permission revalidation check & audit for @file * @op: operation being checked * @label: label being enforced (NOT NULL) * @file: file to revalidate access permissions on (NOT NULL) * @request: requested permissions * * Returns: %0 if access allowed else error */ int aa_file_perm(const char *op, struct aa_label *label, struct file *file, u32 request) { struct aa_file_ctx *fctx; struct aa_label *flabel; u32 denied; int error = 0; AA_BUG(!label); AA_BUG(!file); fctx = file_ctx(file); rcu_read_lock(); flabel = rcu_dereference(fctx->label); AA_BUG(!flabel); /* revalidate access, if task is unconfined, or the cached cred * doesn't match or if the request is for more permissions than * was granted. * * Note: the test for !unconfined(flabel) is to handle file * delegation from unconfined tasks */ denied = request & ~fctx->allow; if (unconfined(label) || unconfined(flabel) || (!denied && aa_label_is_subset(flabel, label))) goto done; /* TODO: label cross check */ if (file->f_path.mnt && path_mediated_fs(file->f_path.dentry)) error = __file_path_perm(op, label, flabel, file, request, denied); else if (S_ISSOCK(file_inode(file)->i_mode)) error = __file_sock_perm(op, label, flabel, file, request, denied); done: rcu_read_unlock(); return error; } static void revalidate_tty(struct aa_label *label) { struct tty_struct *tty; int drop_tty = 0; tty = get_current_tty(); if (!tty) return; spin_lock(&tty->files_lock); if (!list_empty(&tty->tty_files)) { struct tty_file_private *file_priv; struct file *file; /* TODO: Revalidate access to controlling tty. */ file_priv = list_first_entry(&tty->tty_files, struct tty_file_private, list); file = file_priv->file; if (aa_file_perm(OP_INHERIT, label, file, MAY_READ | MAY_WRITE)) drop_tty = 1; } spin_unlock(&tty->files_lock); tty_kref_put(tty); if (drop_tty) no_tty(); } static int match_file(const void *p, struct file *file, unsigned int fd) { struct aa_label *label = (struct aa_label *)p; if (aa_file_perm(OP_INHERIT, label, file, aa_map_file_to_perms(file))) return fd + 1; return 0; } /* based on selinux's flush_unauthorized_files */ void aa_inherit_files(const struct cred *cred, struct files_struct *files) { struct aa_label *label = aa_get_newest_cred_label(cred); struct file *devnull = NULL; unsigned int n; revalidate_tty(label); /* Revalidate access to inherited open files. */ n = iterate_fd(files, 0, match_file, label); if (!n) /* none found? */ goto out; devnull = dentry_open(&aa_null, O_RDWR, cred); if (IS_ERR(devnull)) devnull = NULL; /* replace all the matching ones with this */ do { replace_fd(n - 1, devnull, 0); } while ((n = iterate_fd(files, n, match_file, label)) != 0); if (devnull) fput(devnull); out: aa_put_label(label); } |
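The aa_inherit_files() path above revalidates every inherited descriptor and, via match_file() and replace_fd(), swaps any file that fails the check for /dev/null. A minimal user-space sketch of the same idea follows; it is illustrative only, and check_fd() is a hypothetical stand-in for aa_file_perm().

/*
 * Hedged user-space sketch (not kernel code): mimic the
 * aa_inherit_files()/replace_fd() pattern by scanning this process's
 * open descriptors and redirecting any fd that fails a policy check
 * to /dev/null.  check_fd() is a hypothetical stand-in for
 * aa_file_perm().
 */
#include <dirent.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

static int check_fd(int fd)
{
	/* Placeholder policy: allow everything. */
	(void)fd;
	return 1;
}

int main(void)
{
	int devnull = open("/dev/null", O_RDWR);
	DIR *d = opendir("/proc/self/fd");
	struct dirent *ent;

	if (devnull < 0 || !d)
		return 1;

	while ((ent = readdir(d)) != NULL) {
		int fd = atoi(ent->d_name);

		/* Skip ".", "..", the std fds, the directory fd and devnull. */
		if (fd <= 2 || fd == dirfd(d) || fd == devnull)
			continue;
		if (!check_fd(fd))
			dup2(devnull, fd);	/* analogue of replace_fd() */
	}
	closedir(d);
	close(devnull);
	return 0;
}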
44 4 40 44 44 12 12 12 12 12 11 12 1 1 11 10 44 12 12 10 44 6 4 14 1 31 49 49 49 35 35 4 31 35 35 35 35 38 38 9 38 35 38 38 9 9 9 38 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 | /* RxRPC remote transport endpoint record management * * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/udp.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/slab.h> #include <linux/hashtable.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <net/ip.h> #include <net/route.h> #include <net/ip6_route.h> #include "ar-internal.h" /* * Hash a peer key. 
*/ static unsigned long rxrpc_peer_hash_key(struct rxrpc_local *local, const struct sockaddr_rxrpc *srx) { const u16 *p; unsigned int i, size; unsigned long hash_key; _enter(""); hash_key = (unsigned long)local / __alignof__(*local); hash_key += srx->transport_type; hash_key += srx->transport_len; hash_key += srx->transport.family; switch (srx->transport.family) { case AF_INET: hash_key += (u16 __force)srx->transport.sin.sin_port; size = sizeof(srx->transport.sin.sin_addr); p = (u16 *)&srx->transport.sin.sin_addr; break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: hash_key += (u16 __force)srx->transport.sin.sin_port; size = sizeof(srx->transport.sin6.sin6_addr); p = (u16 *)&srx->transport.sin6.sin6_addr; break; #endif default: WARN(1, "AF_RXRPC: Unsupported transport address family\n"); return 0; } /* Step through the peer address in 16-bit portions for speed */ for (i = 0; i < size; i += sizeof(*p), p++) hash_key += *p; _leave(" 0x%lx", hash_key); return hash_key; } /* * Compare a peer to a key. Return -ve, 0 or +ve to indicate less than, same * or greater than. * * Unfortunately, the primitives in linux/hashtable.h don't allow for sorted * buckets and mid-bucket insertion, so we don't make full use of this * information at this point. */ static long rxrpc_peer_cmp_key(const struct rxrpc_peer *peer, struct rxrpc_local *local, const struct sockaddr_rxrpc *srx, unsigned long hash_key) { long diff; diff = ((peer->hash_key - hash_key) ?: ((unsigned long)peer->local - (unsigned long)local) ?: (peer->srx.transport_type - srx->transport_type) ?: (peer->srx.transport_len - srx->transport_len) ?: (peer->srx.transport.family - srx->transport.family)); if (diff != 0) return diff; switch (srx->transport.family) { case AF_INET: return ((u16 __force)peer->srx.transport.sin.sin_port - (u16 __force)srx->transport.sin.sin_port) ?: memcmp(&peer->srx.transport.sin.sin_addr, &srx->transport.sin.sin_addr, sizeof(struct in_addr)); #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: return ((u16 __force)peer->srx.transport.sin6.sin6_port - (u16 __force)srx->transport.sin6.sin6_port) ?: memcmp(&peer->srx.transport.sin6.sin6_addr, &srx->transport.sin6.sin6_addr, sizeof(struct in6_addr)); #endif default: BUG(); } } /* * Look up a remote transport endpoint for the specified address using RCU. */ static struct rxrpc_peer *__rxrpc_lookup_peer_rcu( struct rxrpc_local *local, const struct sockaddr_rxrpc *srx, unsigned long hash_key) { struct rxrpc_peer *peer; struct rxrpc_net *rxnet = local->rxnet; hash_for_each_possible_rcu(rxnet->peer_hash, peer, hash_link, hash_key) { if (rxrpc_peer_cmp_key(peer, local, srx, hash_key) == 0 && atomic_read(&peer->usage) > 0) return peer; } return NULL; } /* * Look up a remote transport endpoint for the specified address using RCU. 
*/ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local, const struct sockaddr_rxrpc *srx) { struct rxrpc_peer *peer; unsigned long hash_key = rxrpc_peer_hash_key(local, srx); peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); if (peer) { _net("PEER %d {%pISp}", peer->debug_id, &peer->srx.transport); _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage)); } return peer; } /* * assess the MTU size for the network interface through which this peer is * reached */ static void rxrpc_assess_MTU_size(struct rxrpc_sock *rx, struct rxrpc_peer *peer) { struct net *net = sock_net(&rx->sk); struct dst_entry *dst; struct rtable *rt; struct flowi fl; struct flowi4 *fl4 = &fl.u.ip4; #ifdef CONFIG_AF_RXRPC_IPV6 struct flowi6 *fl6 = &fl.u.ip6; #endif peer->if_mtu = 1500; memset(&fl, 0, sizeof(fl)); switch (peer->srx.transport.family) { case AF_INET: rt = ip_route_output_ports( net, fl4, NULL, peer->srx.transport.sin.sin_addr.s_addr, 0, htons(7000), htons(7001), IPPROTO_UDP, 0, 0); if (IS_ERR(rt)) { _leave(" [route err %ld]", PTR_ERR(rt)); return; } dst = &rt->dst; break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: fl6->flowi6_iif = LOOPBACK_IFINDEX; fl6->flowi6_scope = RT_SCOPE_UNIVERSE; fl6->flowi6_proto = IPPROTO_UDP; memcpy(&fl6->daddr, &peer->srx.transport.sin6.sin6_addr, sizeof(struct in6_addr)); fl6->fl6_dport = htons(7001); fl6->fl6_sport = htons(7000); dst = ip6_route_output(net, NULL, fl6); if (dst->error) { _leave(" [route err %d]", dst->error); return; } break; #endif default: BUG(); } peer->if_mtu = dst_mtu(dst); dst_release(dst); _leave(" [if_mtu %u]", peer->if_mtu); } /* * Allocate a peer. */ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) { struct rxrpc_peer *peer; _enter(""); peer = kzalloc(sizeof(struct rxrpc_peer), gfp); if (peer) { atomic_set(&peer->usage, 1); peer->local = rxrpc_get_local(local); INIT_HLIST_HEAD(&peer->error_targets); peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); spin_lock_init(&peer->lock); spin_lock_init(&peer->rtt_input_lock); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); if (RXRPC_TX_SMSS > 2190) peer->cong_cwnd = 2; else if (RXRPC_TX_SMSS > 1095) peer->cong_cwnd = 3; else peer->cong_cwnd = 4; } _leave(" = %p", peer); return peer; } /* * Initialise peer record. */ static void rxrpc_init_peer(struct rxrpc_sock *rx, struct rxrpc_peer *peer, unsigned long hash_key) { peer->hash_key = hash_key; rxrpc_assess_MTU_size(rx, peer); peer->mtu = peer->if_mtu; peer->rtt_last_req = ktime_get_real(); switch (peer->srx.transport.family) { case AF_INET: peer->hdrsize = sizeof(struct iphdr); break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: peer->hdrsize = sizeof(struct ipv6hdr); break; #endif default: BUG(); } switch (peer->srx.transport_type) { case SOCK_DGRAM: peer->hdrsize += sizeof(struct udphdr); break; default: BUG(); } peer->hdrsize += sizeof(struct rxrpc_wire_header); peer->maxdata = peer->mtu - peer->hdrsize; } /* * Set up a new peer. */ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_sock *rx, struct rxrpc_local *local, struct sockaddr_rxrpc *srx, unsigned long hash_key, gfp_t gfp) { struct rxrpc_peer *peer; _enter(""); peer = rxrpc_alloc_peer(local, gfp); if (peer) { memcpy(&peer->srx, srx, sizeof(*srx)); rxrpc_init_peer(rx, peer, hash_key); } _leave(" = %p", peer); return peer; } static void rxrpc_free_peer(struct rxrpc_peer *peer) { rxrpc_put_local(peer->local); kfree_rcu(peer, rcu); } /* * Set up a new incoming peer. 
There shouldn't be any other matching peers * since we've already done a search in the list from the non-reentrant context * (the data_ready handler) that is the only place we can add new peers. */ void rxrpc_new_incoming_peer(struct rxrpc_sock *rx, struct rxrpc_local *local, struct rxrpc_peer *peer) { struct rxrpc_net *rxnet = local->rxnet; unsigned long hash_key; hash_key = rxrpc_peer_hash_key(local, &peer->srx); rxrpc_init_peer(rx, peer, hash_key); spin_lock(&rxnet->peer_hash_lock); hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new); spin_unlock(&rxnet->peer_hash_lock); } /* * obtain a remote transport endpoint for the specified address */ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx, struct rxrpc_local *local, struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_peer *peer, *candidate; struct rxrpc_net *rxnet = local->rxnet; unsigned long hash_key = rxrpc_peer_hash_key(local, srx); _enter("{%pISp}", &srx->transport); /* search the peer list first */ rcu_read_lock(); peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); if (peer && !rxrpc_get_peer_maybe(peer)) peer = NULL; rcu_read_unlock(); if (!peer) { /* The peer is not yet present in hash - create a candidate * for a new record and then redo the search. */ candidate = rxrpc_create_peer(rx, local, srx, hash_key, gfp); if (!candidate) { _leave(" = NULL [nomem]"); return NULL; } spin_lock_bh(&rxnet->peer_hash_lock); /* Need to check that we aren't racing with someone else */ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); if (peer && !rxrpc_get_peer_maybe(peer)) peer = NULL; if (!peer) { hash_add_rcu(rxnet->peer_hash, &candidate->hash_link, hash_key); list_add_tail(&candidate->keepalive_link, &rxnet->peer_keepalive_new); } spin_unlock_bh(&rxnet->peer_hash_lock); if (peer) rxrpc_free_peer(candidate); else peer = candidate; } _net("PEER %d {%pISp}", peer->debug_id, &peer->srx.transport); _leave(" = %p {u=%d}", peer, atomic_read(&peer->usage)); return peer; } /* * Get a ref on a peer record. */ struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer) { const void *here = __builtin_return_address(0); int n; n = atomic_inc_return(&peer->usage); trace_rxrpc_peer(peer->debug_id, rxrpc_peer_got, n, here); return peer; } /* * Get a ref on a peer record unless its usage has already reached 0. */ struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) { const void *here = __builtin_return_address(0); if (peer) { int n = atomic_fetch_add_unless(&peer->usage, 1, 0); if (n > 0) trace_rxrpc_peer(peer->debug_id, rxrpc_peer_got, n + 1, here); else peer = NULL; } return peer; } /* * Discard a peer record. */ static void __rxrpc_put_peer(struct rxrpc_peer *peer) { struct rxrpc_net *rxnet = peer->local->rxnet; ASSERT(hlist_empty(&peer->error_targets)); spin_lock_bh(&rxnet->peer_hash_lock); hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); spin_unlock_bh(&rxnet->peer_hash_lock); rxrpc_free_peer(peer); } /* * Drop a ref on a peer record. */ void rxrpc_put_peer(struct rxrpc_peer *peer) { const void *here = __builtin_return_address(0); unsigned int debug_id; int n; if (peer) { debug_id = peer->debug_id; n = atomic_dec_return(&peer->usage); trace_rxrpc_peer(debug_id, rxrpc_peer_put, n, here); if (n == 0) __rxrpc_put_peer(peer); } } /* * Drop a ref on a peer record where the caller already holds the * peer_hash_lock. 
*/ void rxrpc_put_peer_locked(struct rxrpc_peer *peer) { const void *here = __builtin_return_address(0); unsigned int debug_id = peer->debug_id; int n; n = atomic_dec_return(&peer->usage); trace_rxrpc_peer(debug_id, rxrpc_peer_put, n, here); if (n == 0) { hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); rxrpc_free_peer(peer); } } /* * Make sure all peer records have been discarded. */ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet) { struct rxrpc_peer *peer; int i; for (i = 0; i < HASH_SIZE(rxnet->peer_hash); i++) { if (hlist_empty(&rxnet->peer_hash[i])) continue; hlist_for_each_entry(peer, &rxnet->peer_hash[i], hash_link) { pr_err("Leaked peer %u {%u} %pISp\n", peer->debug_id, atomic_read(&peer->usage), &peer->srx.transport); } } } /** * rxrpc_kernel_get_peer - Get the peer address of a call * @sock: The socket on which the call is in progress. * @call: The call to query * @_srx: Where to place the result * * Get the address of the remote peer in a call. */ void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call, struct sockaddr_rxrpc *_srx) { *_srx = call->peer->srx; } EXPORT_SYMBOL(rxrpc_kernel_get_peer); /** * rxrpc_kernel_get_rtt - Get a call's peer RTT * @sock: The socket on which the call is in progress. * @call: The call to query * * Get the call's peer RTT. */ u64 rxrpc_kernel_get_rtt(struct socket *sock, struct rxrpc_call *call) { return call->peer->rtt; } EXPORT_SYMBOL(rxrpc_kernel_get_rtt); |
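rxrpc_get_peer_maybe() above relies on atomic_fetch_add_unless() so that a peer found under RCU is only referenced if its usage count has not already reached zero. The following is a hedged user-space sketch of that idiom using C11 atomics; struct obj is a hypothetical type, not an rxrpc structure.

/*
 * Hedged user-space sketch of the rxrpc_get_peer_maybe() idiom: take a
 * reference on an object found under RCU only if its count has not
 * already dropped to zero.  C11 atomics emulate the kernel's
 * atomic_fetch_add_unless().
 */
#include <stdatomic.h>
#include <stddef.h>

struct obj {
	atomic_int usage;
	/* ... payload ... */
};

/* Add 1 to *v unless it is currently 0; return the old value. */
static int fetch_add_unless_zero(atomic_int *v)
{
	int old = atomic_load(v);

	while (old != 0 &&
	       !atomic_compare_exchange_weak(v, &old, old + 1))
		;	/* a failed CAS reloads 'old' for the retry */
	return old;
}

static struct obj *get_obj_maybe(struct obj *o)
{
	if (o && fetch_add_unless_zero(&o->usage) == 0)
		return NULL;	/* object is already being torn down */
	return o;
}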
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM udp

#if !defined(_TRACE_UDP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_UDP_H

#include <linux/udp.h>
#include <linux/tracepoint.h>

TRACE_EVENT(udp_fail_queue_rcv_skb,

	TP_PROTO(int rc, struct sock *sk),

	TP_ARGS(rc, sk),

	TP_STRUCT__entry(
		__field(int, rc)
		__field(__u16, lport)
	),

	TP_fast_assign(
		__entry->rc = rc;
		__entry->lport = inet_sk(sk)->inet_num;
	),

	TP_printk("rc=%d port=%hu", __entry->rc, __entry->lport)
);

#endif /* _TRACE_UDP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
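The TRACE_EVENT() above defines the udp:udp_fail_queue_rcv_skb tracepoint, recording the return code and the local port. As a rough illustration, the sketch below enables it from user space through tracefs; it assumes tracefs is mounted at /sys/kernel/tracing (older setups use /sys/kernel/debug/tracing) and that the caller has the required privileges.

/*
 * Hedged sketch: enable the udp_fail_queue_rcv_skb event from user
 * space.  The tracefs mount point is an assumption and may differ.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path =
		"/sys/kernel/tracing/events/udp/udp_fail_queue_rcv_skb/enable";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "1", 1) != 1)
		perror("write");
	close(fd);

	/* "rc=... port=..." records can then be read from trace_pipe. */
	return 0;
}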
3 2 1 1 1 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 | /* * VMware VMCI Driver * * Copyright (C) 2012 VMware, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation version 2 and no later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. */ #include <linux/vmw_vmci_defs.h> #include <linux/hash.h> #include <linux/types.h> #include <linux/rculist.h> #include <linux/completion.h> #include "vmci_resource.h" #include "vmci_driver.h" #define VMCI_RESOURCE_HASH_BITS 7 #define VMCI_RESOURCE_HASH_BUCKETS (1 << VMCI_RESOURCE_HASH_BITS) struct vmci_hash_table { spinlock_t lock; struct hlist_head entries[VMCI_RESOURCE_HASH_BUCKETS]; }; static struct vmci_hash_table vmci_resource_table = { .lock = __SPIN_LOCK_UNLOCKED(vmci_resource_table.lock), }; static unsigned int vmci_resource_hash(struct vmci_handle handle) { return hash_32(handle.resource, VMCI_RESOURCE_HASH_BITS); } /* * Gets a resource (if one exists) matching given handle from the hash table. */ static struct vmci_resource *vmci_resource_lookup(struct vmci_handle handle, enum vmci_resource_type type) { struct vmci_resource *r, *resource = NULL; unsigned int idx = vmci_resource_hash(handle); rcu_read_lock(); hlist_for_each_entry_rcu(r, &vmci_resource_table.entries[idx], node) { u32 cid = r->handle.context; u32 rid = r->handle.resource; if (r->type == type && rid == handle.resource && (cid == handle.context || cid == VMCI_INVALID_ID || handle.context == VMCI_INVALID_ID)) { resource = r; break; } } rcu_read_unlock(); return resource; } /* * Find an unused resource ID and return it. The first * VMCI_RESERVED_RESOURCE_ID_MAX are reserved so we start from * its value + 1. * Returns VMCI resource id on success, VMCI_INVALID_ID on failure. */ static u32 vmci_resource_find_id(u32 context_id, enum vmci_resource_type resource_type) { static u32 resource_id = VMCI_RESERVED_RESOURCE_ID_MAX + 1; u32 old_rid = resource_id; u32 current_rid; /* * Generate a unique resource ID. Keep on trying until we wrap around * in the RID space. */ do { struct vmci_handle handle; current_rid = resource_id; resource_id++; if (unlikely(resource_id == VMCI_INVALID_ID)) { /* Skip the reserved rids. 
*/ resource_id = VMCI_RESERVED_RESOURCE_ID_MAX + 1; } handle = vmci_make_handle(context_id, current_rid); if (!vmci_resource_lookup(handle, resource_type)) return current_rid; } while (resource_id != old_rid); return VMCI_INVALID_ID; } int vmci_resource_add(struct vmci_resource *resource, enum vmci_resource_type resource_type, struct vmci_handle handle) { unsigned int idx; int result; spin_lock(&vmci_resource_table.lock); if (handle.resource == VMCI_INVALID_ID) { handle.resource = vmci_resource_find_id(handle.context, resource_type); if (handle.resource == VMCI_INVALID_ID) { result = VMCI_ERROR_NO_HANDLE; goto out; } } else if (vmci_resource_lookup(handle, resource_type)) { result = VMCI_ERROR_ALREADY_EXISTS; goto out; } resource->handle = handle; resource->type = resource_type; INIT_HLIST_NODE(&resource->node); kref_init(&resource->kref); init_completion(&resource->done); idx = vmci_resource_hash(resource->handle); hlist_add_head_rcu(&resource->node, &vmci_resource_table.entries[idx]); result = VMCI_SUCCESS; out: spin_unlock(&vmci_resource_table.lock); return result; } void vmci_resource_remove(struct vmci_resource *resource) { struct vmci_handle handle = resource->handle; unsigned int idx = vmci_resource_hash(handle); struct vmci_resource *r; /* Remove resource from hash table. */ spin_lock(&vmci_resource_table.lock); hlist_for_each_entry(r, &vmci_resource_table.entries[idx], node) { if (vmci_handle_is_equal(r->handle, resource->handle)) { hlist_del_init_rcu(&r->node); break; } } spin_unlock(&vmci_resource_table.lock); synchronize_rcu(); vmci_resource_put(resource); wait_for_completion(&resource->done); } struct vmci_resource * vmci_resource_by_handle(struct vmci_handle resource_handle, enum vmci_resource_type resource_type) { struct vmci_resource *r, *resource = NULL; rcu_read_lock(); r = vmci_resource_lookup(resource_handle, resource_type); if (r && (resource_type == r->type || resource_type == VMCI_RESOURCE_TYPE_ANY)) { resource = vmci_resource_get(r); } rcu_read_unlock(); return resource; } /* * Get a reference to given resource. */ struct vmci_resource *vmci_resource_get(struct vmci_resource *resource) { kref_get(&resource->kref); return resource; } static void vmci_release_resource(struct kref *kref) { struct vmci_resource *resource = container_of(kref, struct vmci_resource, kref); /* Verify the resource has been unlinked from hash table */ WARN_ON(!hlist_unhashed(&resource->node)); /* Signal that container of this resource can now be destroyed */ complete(&resource->done); } /* * Resource's release function will get called if last reference. * If it is the last reference, then we are sure that nobody else * can increment the count again (it's gone from the resource hash * table), so there's no need for locking here. */ int vmci_resource_put(struct vmci_resource *resource) { /* * We propagate the information back to caller in case it wants to know * whether entry was freed. */ return kref_put(&resource->kref, vmci_release_resource) ? VMCI_SUCCESS_ENTRY_DEAD : VMCI_SUCCESS; } struct vmci_handle vmci_resource_handle(struct vmci_resource *resource) { return resource->handle; } |
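vmci_resource_find_id() above walks a static counter through the resource-ID space, skipping the reserved range and VMCI_INVALID_ID, until it finds an ID that does not collide or the search wraps back to where it started. The user-space sketch below mirrors that loop; the constants and the in_use() predicate are illustrative stand-ins, not VMCI definitions.

/*
 * Hedged user-space sketch of the wrap-around ID search in
 * vmci_resource_find_id(): advance a static counter, skipping the
 * reserved range and the invalid ID, until an unused ID is found or
 * the search wraps back to its starting point.
 */
#include <stdbool.h>
#include <stdint.h>

#define ID_INVALID	0xFFFFFFFFu
#define ID_RESERVED_MAX	15u

static bool in_use(uint32_t id)
{
	/* Toy occupancy check standing in for the hash-table lookup. */
	return id == 16 || id == 17;
}

uint32_t find_free_id(void)
{
	static uint32_t next_id = ID_RESERVED_MAX + 1;
	uint32_t start = next_id;
	uint32_t id;

	do {
		id = next_id++;
		if (next_id == ID_INVALID)	/* skip the reserved ids */
			next_id = ID_RESERVED_MAX + 1;
		if (!in_use(id))
			return id;
	} while (next_id != start);

	return ID_INVALID;	/* ID space exhausted */
}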
70 70 79 75 74 74 74 74 70 74 74 79 70 22 22 4 4 11 11 10 10 3 11 3 3 3 3 1 15 3 14 14 14 13 4 12 13 12 12 12 12 12 12 12 12 11 11 11 34 34 3 3 2 2 34 1 1 29 29 1 23 18 2 13 5 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 | /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the 
terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* Bluetooth address family and sockets. */ #include <linux/module.h> #include <linux/debugfs.h> #include <linux/stringify.h> #include <linux/sched/signal.h> #include <asm/ioctls.h> #include <net/bluetooth/bluetooth.h> #include <linux/proc_fs.h> #include "leds.h" #include "selftest.h" /* Bluetooth sockets */ #define BT_MAX_PROTO 8 static const struct net_proto_family *bt_proto[BT_MAX_PROTO]; static DEFINE_RWLOCK(bt_proto_lock); static struct lock_class_key bt_lock_key[BT_MAX_PROTO]; static const char *const bt_key_strings[BT_MAX_PROTO] = { "sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP", "sk_lock-AF_BLUETOOTH-BTPROTO_HCI", "sk_lock-AF_BLUETOOTH-BTPROTO_SCO", "sk_lock-AF_BLUETOOTH-BTPROTO_RFCOMM", "sk_lock-AF_BLUETOOTH-BTPROTO_BNEP", "sk_lock-AF_BLUETOOTH-BTPROTO_CMTP", "sk_lock-AF_BLUETOOTH-BTPROTO_HIDP", "sk_lock-AF_BLUETOOTH-BTPROTO_AVDTP", }; static struct lock_class_key bt_slock_key[BT_MAX_PROTO]; static const char *const bt_slock_key_strings[BT_MAX_PROTO] = { "slock-AF_BLUETOOTH-BTPROTO_L2CAP", "slock-AF_BLUETOOTH-BTPROTO_HCI", "slock-AF_BLUETOOTH-BTPROTO_SCO", "slock-AF_BLUETOOTH-BTPROTO_RFCOMM", "slock-AF_BLUETOOTH-BTPROTO_BNEP", "slock-AF_BLUETOOTH-BTPROTO_CMTP", "slock-AF_BLUETOOTH-BTPROTO_HIDP", "slock-AF_BLUETOOTH-BTPROTO_AVDTP", }; void bt_sock_reclassify_lock(struct sock *sk, int proto) { BUG_ON(!sk); BUG_ON(!sock_allow_reclassification(sk)); sock_lock_init_class_and_name(sk, bt_slock_key_strings[proto], &bt_slock_key[proto], bt_key_strings[proto], &bt_lock_key[proto]); } EXPORT_SYMBOL(bt_sock_reclassify_lock); int bt_sock_register(int proto, const struct net_proto_family *ops) { int err = 0; if (proto < 0 || proto >= BT_MAX_PROTO) return -EINVAL; write_lock(&bt_proto_lock); if (bt_proto[proto]) err = -EEXIST; else bt_proto[proto] = ops; write_unlock(&bt_proto_lock); return err; } EXPORT_SYMBOL(bt_sock_register); void bt_sock_unregister(int proto) { if (proto < 0 || proto >= BT_MAX_PROTO) return; write_lock(&bt_proto_lock); bt_proto[proto] = NULL; write_unlock(&bt_proto_lock); } EXPORT_SYMBOL(bt_sock_unregister); static int bt_sock_create(struct net *net, struct socket *sock, int proto, int kern) { int err; if (net != &init_net) return -EAFNOSUPPORT; if (proto < 0 || proto >= BT_MAX_PROTO) return -EINVAL; if (!bt_proto[proto]) request_module("bt-proto-%d", proto); err = -EPROTONOSUPPORT; read_lock(&bt_proto_lock); if (bt_proto[proto] && try_module_get(bt_proto[proto]->owner)) { err = bt_proto[proto]->create(net, sock, proto, kern); if (!err) bt_sock_reclassify_lock(sock->sk, proto); module_put(bt_proto[proto]->owner); } read_unlock(&bt_proto_lock); return err; } void bt_sock_link(struct bt_sock_list *l, struct sock *sk) { write_lock(&l->lock); sk_add_node(sk, &l->head); 
write_unlock(&l->lock); } EXPORT_SYMBOL(bt_sock_link); void bt_sock_unlink(struct bt_sock_list *l, struct sock *sk) { write_lock(&l->lock); sk_del_node_init(sk); write_unlock(&l->lock); } EXPORT_SYMBOL(bt_sock_unlink); void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) { BT_DBG("parent %p, sk %p", parent, sk); sock_hold(sk); if (bh) bh_lock_sock_nested(sk); else lock_sock_nested(sk, SINGLE_DEPTH_NESTING); list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q); bt_sk(sk)->parent = parent; if (bh) bh_unlock_sock(sk); else release_sock(sk); parent->sk_ack_backlog++; } EXPORT_SYMBOL(bt_accept_enqueue); /* Calling function must hold the sk lock. * bt_sk(sk)->parent must be non-NULL meaning sk is in the parent list. */ void bt_accept_unlink(struct sock *sk) { BT_DBG("sk %p state %d", sk, sk->sk_state); list_del_init(&bt_sk(sk)->accept_q); bt_sk(sk)->parent->sk_ack_backlog--; bt_sk(sk)->parent = NULL; sock_put(sk); } EXPORT_SYMBOL(bt_accept_unlink); struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock) { struct bt_sock *s, *n; struct sock *sk; BT_DBG("parent %p", parent); restart: list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) { sk = (struct sock *)s; /* Prevent early freeing of sk due to unlink and sock_kill */ sock_hold(sk); lock_sock(sk); /* Check sk has not already been unlinked via * bt_accept_unlink() due to serialisation caused by sk locking */ if (!bt_sk(sk)->parent) { BT_DBG("sk %p, already unlinked", sk); release_sock(sk); sock_put(sk); /* Restart the loop as sk is no longer in the list * and also avoid a potential infinite loop because * list_for_each_entry_safe() is not thread safe. */ goto restart; } /* sk is safely in the parent list so reduce reference count */ sock_put(sk); /* FIXME: Is this check still needed */ if (sk->sk_state == BT_CLOSED) { bt_accept_unlink(sk); release_sock(sk); continue; } if (sk->sk_state == BT_CONNECTED || !newsock || test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags)) { bt_accept_unlink(sk); if (newsock) sock_graft(sk, newsock); release_sock(sk); return sk; } release_sock(sk); } return NULL; } EXPORT_SYMBOL(bt_accept_dequeue); int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { int noblock = flags & MSG_DONTWAIT; struct sock *sk = sock->sk; struct sk_buff *skb; size_t copied; size_t skblen; int err; BT_DBG("sock %p sk %p len %zu", sock, sk, len); if (flags & MSG_OOB) return -EOPNOTSUPP; skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) { if (sk->sk_shutdown & RCV_SHUTDOWN) return 0; return err; } skblen = skb->len; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } skb_reset_transport_header(skb); err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err == 0) { sock_recv_ts_and_drops(msg, sk, skb); if (msg->msg_name && bt_sk(sk)->skb_msg_name) bt_sk(sk)->skb_msg_name(skb, msg->msg_name, &msg->msg_namelen); } skb_free_datagram(sk, skb); if (flags & MSG_TRUNC) copied = skblen; return err ? 
: copied; } EXPORT_SYMBOL(bt_sock_recvmsg); static long bt_sock_data_wait(struct sock *sk, long timeo) { DECLARE_WAITQUEUE(wait, current); add_wait_queue(sk_sleep(sk), &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (!skb_queue_empty(&sk->sk_receive_queue)) break; if (sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN)) break; if (signal_pending(current) || !timeo) break; sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return timeo; } int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; int err = 0; size_t target, copied = 0; long timeo; if (flags & MSG_OOB) return -EOPNOTSUPP; BT_DBG("sk %p size %zu", sk, size); lock_sock(sk); target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { struct sk_buff *skb; int chunk; skb = skb_dequeue(&sk->sk_receive_queue); if (!skb) { if (copied >= target) break; err = sock_error(sk); if (err) break; if (sk->sk_shutdown & RCV_SHUTDOWN) break; err = -EAGAIN; if (!timeo) break; timeo = bt_sock_data_wait(sk, timeo); if (signal_pending(current)) { err = sock_intr_errno(timeo); goto out; } continue; } chunk = min_t(unsigned int, skb->len, size); if (skb_copy_datagram_msg(skb, 0, msg, chunk)) { skb_queue_head(&sk->sk_receive_queue, skb); if (!copied) copied = -EFAULT; break; } copied += chunk; size -= chunk; sock_recv_ts_and_drops(msg, sk, skb); if (!(flags & MSG_PEEK)) { int skb_len = skb_headlen(skb); if (chunk <= skb_len) { __skb_pull(skb, chunk); } else { struct sk_buff *frag; __skb_pull(skb, skb_len); chunk -= skb_len; skb_walk_frags(skb, frag) { if (chunk <= frag->len) { /* Pulling partial data */ skb->len -= chunk; skb->data_len -= chunk; __skb_pull(frag, chunk); break; } else if (frag->len) { /* Pulling all frag data */ chunk -= frag->len; skb->len -= frag->len; skb->data_len -= frag->len; __skb_pull(frag, frag->len); } } } if (skb->len) { skb_queue_head(&sk->sk_receive_queue, skb); break; } kfree_skb(skb); } else { /* put message back and return */ skb_queue_head(&sk->sk_receive_queue, skb); break; } } while (size); out: release_sock(sk); return copied ? : err; } EXPORT_SYMBOL(bt_sock_stream_recvmsg); static inline __poll_t bt_accept_poll(struct sock *parent) { struct bt_sock *s, *n; struct sock *sk; list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) { sk = (struct sock *)s; if (sk->sk_state == BT_CONNECTED || (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags) && sk->sk_state == BT_CONNECT2)) return EPOLLIN | EPOLLRDNORM; } return 0; } __poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; BT_DBG("sock %p, sk %p", sock, sk); poll_wait(file, sk_sleep(sk), wait); if (sk->sk_state == BT_LISTEN) return bt_accept_poll(sk); if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR | (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? 
EPOLLPRI : 0); if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= EPOLLHUP; if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; if (sk->sk_state == BT_CLOSED) mask |= EPOLLHUP; if (sk->sk_state == BT_CONNECT || sk->sk_state == BT_CONNECT2 || sk->sk_state == BT_CONFIG) return mask; if (!test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags) && sock_writeable(sk)) mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND; else sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); return mask; } EXPORT_SYMBOL(bt_sock_poll); int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct sk_buff *skb; long amount; int err; BT_DBG("sk %p cmd %x arg %lx", sk, cmd, arg); switch (cmd) { case TIOCOUTQ: if (sk->sk_state == BT_LISTEN) return -EINVAL; amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (amount < 0) amount = 0; err = put_user(amount, (int __user *) arg); break; case TIOCINQ: if (sk->sk_state == BT_LISTEN) return -EINVAL; lock_sock(sk); skb = skb_peek(&sk->sk_receive_queue); amount = skb ? skb->len : 0; release_sock(sk); err = put_user(amount, (int __user *) arg); break; case SIOCGSTAMP: err = sock_get_timestamp(sk, (struct timeval __user *) arg); break; case SIOCGSTAMPNS: err = sock_get_timestampns(sk, (struct timespec __user *) arg); break; default: err = -ENOIOCTLCMD; break; } return err; } EXPORT_SYMBOL(bt_sock_ioctl); /* This function expects the sk lock to be held when called */ int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo) { DECLARE_WAITQUEUE(wait, current); int err = 0; BT_DBG("sk %p", sk); add_wait_queue(sk_sleep(sk), &wait); set_current_state(TASK_INTERRUPTIBLE); while (sk->sk_state != state) { if (!timeo) { err = -EINPROGRESS; break; } if (signal_pending(current)) { err = sock_intr_errno(timeo); break; } release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); set_current_state(TASK_INTERRUPTIBLE); err = sock_error(sk); if (err) break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return err; } EXPORT_SYMBOL(bt_sock_wait_state); /* This function expects the sk lock to be held when called */ int bt_sock_wait_ready(struct sock *sk, unsigned long flags) { DECLARE_WAITQUEUE(wait, current); unsigned long timeo; int err = 0; BT_DBG("sk %p", sk); timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); add_wait_queue(sk_sleep(sk), &wait); set_current_state(TASK_INTERRUPTIBLE); while (test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags)) { if (!timeo) { err = -EAGAIN; break; } if (signal_pending(current)) { err = sock_intr_errno(timeo); break; } release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); set_current_state(TASK_INTERRUPTIBLE); err = sock_error(sk); if (err) break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return err; } EXPORT_SYMBOL(bt_sock_wait_ready); #ifdef CONFIG_PROC_FS static void *bt_seq_start(struct seq_file *seq, loff_t *pos) __acquires(seq->private->l->lock) { struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); read_lock(&l->lock); return seq_hlist_start_head(&l->head, *pos); } static void *bt_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); return seq_hlist_next(v, &l->head, pos); } static void bt_seq_stop(struct seq_file *seq, void *v) __releases(seq->private->l->lock) { struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); read_unlock(&l->lock); } static int bt_seq_show(struct 
seq_file *seq, void *v) { struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); if (v == SEQ_START_TOKEN) { seq_puts(seq ,"sk RefCnt Rmem Wmem User Inode Parent"); if (l->custom_seq_show) { seq_putc(seq, ' '); l->custom_seq_show(seq, v); } seq_putc(seq, '\n'); } else { struct sock *sk = sk_entry(v); struct bt_sock *bt = bt_sk(sk); seq_printf(seq, "%pK %-6d %-6u %-6u %-6u %-6lu %-6lu", sk, refcount_read(&sk->sk_refcnt), sk_rmem_alloc_get(sk), sk_wmem_alloc_get(sk), from_kuid(seq_user_ns(seq), sock_i_uid(sk)), sock_i_ino(sk), bt->parent? sock_i_ino(bt->parent): 0LU); if (l->custom_seq_show) { seq_putc(seq, ' '); l->custom_seq_show(seq, v); } seq_putc(seq, '\n'); } return 0; } static const struct seq_operations bt_seq_ops = { .start = bt_seq_start, .next = bt_seq_next, .stop = bt_seq_stop, .show = bt_seq_show, }; int bt_procfs_init(struct net *net, const char *name, struct bt_sock_list *sk_list, int (* seq_show)(struct seq_file *, void *)) { sk_list->custom_seq_show = seq_show; if (!proc_create_seq_data(name, 0, net->proc_net, &bt_seq_ops, sk_list)) return -ENOMEM; return 0; } void bt_procfs_cleanup(struct net *net, const char *name) { remove_proc_entry(name, net->proc_net); } #else int bt_procfs_init(struct net *net, const char *name, struct bt_sock_list *sk_list, int (* seq_show)(struct seq_file *, void *)) { return 0; } void bt_procfs_cleanup(struct net *net, const char *name) { } #endif EXPORT_SYMBOL(bt_procfs_init); EXPORT_SYMBOL(bt_procfs_cleanup); static const struct net_proto_family bt_sock_family_ops = { .owner = THIS_MODULE, .family = PF_BLUETOOTH, .create = bt_sock_create, }; struct dentry *bt_debugfs; EXPORT_SYMBOL_GPL(bt_debugfs); #define VERSION __stringify(BT_SUBSYS_VERSION) "." \ __stringify(BT_SUBSYS_REVISION) static int __init bt_init(void) { int err; sock_skb_cb_check_size(sizeof(struct bt_skb_cb)); BT_INFO("Core ver %s", VERSION); err = bt_selftest(); if (err < 0) return err; bt_debugfs = debugfs_create_dir("bluetooth", NULL); bt_leds_init(); err = bt_sysfs_init(); if (err < 0) return err; err = sock_register(&bt_sock_family_ops); if (err) goto cleanup_sysfs; BT_INFO("HCI device and connection manager initialized"); err = hci_sock_init(); if (err) goto unregister_socket; err = l2cap_init(); if (err) goto cleanup_socket; err = sco_init(); if (err) goto cleanup_cap; err = mgmt_init(); if (err) goto cleanup_sco; return 0; cleanup_sco: sco_exit(); cleanup_cap: l2cap_exit(); cleanup_socket: hci_sock_cleanup(); unregister_socket: sock_unregister(PF_BLUETOOTH); cleanup_sysfs: bt_sysfs_cleanup(); return err; } static void __exit bt_exit(void) { mgmt_exit(); sco_exit(); l2cap_exit(); hci_sock_cleanup(); sock_unregister(PF_BLUETOOTH); bt_sysfs_cleanup(); bt_leds_cleanup(); debugfs_remove_recursive(bt_debugfs); } subsys_initcall(bt_init); module_exit(bt_exit); MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>"); MODULE_DESCRIPTION("Bluetooth Core ver " VERSION); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_BLUETOOTH); |
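bt_sock_recvmsg() above truncates a datagram that does not fit the caller's buffer, sets MSG_TRUNC, and reports the original length when the caller asked for it. The same behaviour can be observed from user space on any Linux datagram socket; the sketch below uses a loopback UDP socket purely as a stand-in for a Bluetooth socket.

/*
 * Hedged user-space illustration of the truncation handling in
 * bt_sock_recvmsg(): when the receive buffer is smaller than the
 * queued datagram, the copy is truncated and, with MSG_TRUNC
 * requested, the full datagram length is returned.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int s = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in a = { .sin_family = AF_INET };
	socklen_t alen = sizeof(a);
	char small[4];
	ssize_t n;

	if (s < 0)
		return 1;
	a.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	bind(s, (struct sockaddr *)&a, sizeof(a));
	getsockname(s, (struct sockaddr *)&a, &alen);
	sendto(s, "0123456789", 10, 0, (struct sockaddr *)&a, alen);

	/* Ask for the real length even though only 4 bytes fit. */
	n = recv(s, small, sizeof(small), MSG_TRUNC);
	printf("returned %zd bytes, buffer holds %zu\n", n, sizeof(small));

	close(s);
	return 0;
}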
8 7 6 6 4 4 4 1 4 1 4 8 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 | #define pr_fmt(fmt) "IPsec: " fmt #include <crypto/algapi.h> #include <crypto/hash.h> #include <linux/err.h> #include <linux/module.h> #include <linux/slab.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/ah.h> #include <linux/crypto.h> #include <linux/pfkeyv2.h> #include <linux/scatterlist.h> #include <net/icmp.h> #include <net/protocol.h> struct ah_skb_cb { struct xfrm_skb_cb xfrm; void *tmp; }; #define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0])) static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags, unsigned int size) { unsigned int len; len = size + crypto_ahash_digestsize(ahash) + (crypto_ahash_alignmask(ahash) & ~(crypto_tfm_ctx_alignment() - 1)); len = ALIGN(len, crypto_tfm_ctx_alignment()); len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash); len = ALIGN(len, __alignof__(struct scatterlist)); len += sizeof(struct scatterlist) * nfrags; return kmalloc(len, GFP_ATOMIC); } static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset) { return tmp + offset; } static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp, unsigned int offset) { return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1); } static inline struct ahash_request 
*ah_tmp_req(struct crypto_ahash *ahash, u8 *icv) { struct ahash_request *req; req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash), crypto_tfm_ctx_alignment()); ahash_request_set_tfm(req, ahash); return req; } static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash, struct ahash_request *req) { return (void *)ALIGN((unsigned long)(req + 1) + crypto_ahash_reqsize(ahash), __alignof__(struct scatterlist)); } /* Clear mutable options and find final destination to substitute * into IP header for icv calculation. Options are already checked * for validity, so paranoia is not required. */ static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr) { unsigned char *optptr = (unsigned char *)(iph+1); int l = iph->ihl*4 - sizeof(struct iphdr); int optlen; while (l > 0) { switch (*optptr) { case IPOPT_END: return 0; case IPOPT_NOOP: l--; optptr++; continue; } optlen = optptr[1]; if (optlen<2 || optlen>l) return -EINVAL; switch (*optptr) { case IPOPT_SEC: case 0x85: /* Some "Extended Security" crap. */ case IPOPT_CIPSO: case IPOPT_RA: case 0x80|21: /* RFC1770 */ break; case IPOPT_LSRR: case IPOPT_SSRR: if (optlen < 6) return -EINVAL; memcpy(daddr, optptr+optlen-4, 4); /* Fall through */ default: memset(optptr, 0, optlen); } l -= optlen; optptr += optlen; } return 0; } static void ah_output_done(struct crypto_async_request *base, int err) { u8 *icv; struct iphdr *iph; struct sk_buff *skb = base->data; struct xfrm_state *x = skb_dst(skb)->xfrm; struct ah_data *ahp = x->data; struct iphdr *top_iph = ip_hdr(skb); struct ip_auth_hdr *ah = ip_auth_hdr(skb); int ihl = ip_hdrlen(skb); iph = AH_SKB_CB(skb)->tmp; icv = ah_tmp_icv(ahp->ahash, iph, ihl); memcpy(ah->auth_data, icv, ahp->icv_trunc_len); top_iph->tos = iph->tos; top_iph->ttl = iph->ttl; top_iph->frag_off = iph->frag_off; if (top_iph->ihl != 5) { top_iph->daddr = iph->daddr; memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); } kfree(AH_SKB_CB(skb)->tmp); xfrm_output_resume(skb, err); } static int ah_output(struct xfrm_state *x, struct sk_buff *skb) { int err; int nfrags; int ihl; u8 *icv; struct sk_buff *trailer; struct crypto_ahash *ahash; struct ahash_request *req; struct scatterlist *sg; struct iphdr *iph, *top_iph; struct ip_auth_hdr *ah; struct ah_data *ahp; int seqhi_len = 0; __be32 *seqhi; int sglists = 0; struct scatterlist *seqhisg; ahp = x->data; ahash = ahp->ahash; if ((err = skb_cow_data(skb, 0, &trailer)) < 0) goto out; nfrags = err; skb_push(skb, -skb_network_offset(skb)); ah = ip_auth_hdr(skb); ihl = ip_hdrlen(skb); if (x->props.flags & XFRM_STATE_ESN) { sglists = 1; seqhi_len = sizeof(*seqhi); } err = -ENOMEM; iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len); if (!iph) goto out; seqhi = (__be32 *)((char *)iph + ihl); icv = ah_tmp_icv(ahash, seqhi, seqhi_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); seqhisg = sg + nfrags; memset(ah->auth_data, 0, ahp->icv_trunc_len); top_iph = ip_hdr(skb); iph->tos = top_iph->tos; iph->ttl = top_iph->ttl; iph->frag_off = top_iph->frag_off; if (top_iph->ihl != 5) { iph->daddr = top_iph->daddr; memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); err = ip_clear_mutable_options(top_iph, &top_iph->daddr); if (err) goto out_free; } ah->nexthdr = *skb_mac_header(skb); *skb_mac_header(skb) = IPPROTO_AH; top_iph->tos = 0; top_iph->tot_len = htons(skb->len); top_iph->frag_off = 0; top_iph->ttl = 0; top_iph->check = 0; if (x->props.flags & XFRM_STATE_ALIGN4) ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) 
>> 2) - 2; else ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; ah->reserved = 0; ah->spi = x->id.spi; ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); sg_init_table(sg, nfrags + sglists); err = skb_to_sgvec_nomark(skb, sg, 0, skb->len); if (unlikely(err < 0)) goto out_free; if (x->props.flags & XFRM_STATE_ESN) { /* Attach seqhi sg right after packet payload */ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); sg_set_buf(seqhisg, seqhi, seqhi_len); } ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah_output_done, skb); AH_SKB_CB(skb)->tmp = iph; err = crypto_ahash_digest(req); if (err) { if (err == -EINPROGRESS) goto out; if (err == -ENOSPC) err = NET_XMIT_DROP; goto out_free; } memcpy(ah->auth_data, icv, ahp->icv_trunc_len); top_iph->tos = iph->tos; top_iph->ttl = iph->ttl; top_iph->frag_off = iph->frag_off; if (top_iph->ihl != 5) { top_iph->daddr = iph->daddr; memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); } out_free: kfree(iph); out: return err; } static void ah_input_done(struct crypto_async_request *base, int err) { u8 *auth_data; u8 *icv; struct iphdr *work_iph; struct sk_buff *skb = base->data; struct xfrm_state *x = xfrm_input_state(skb); struct ah_data *ahp = x->data; struct ip_auth_hdr *ah = ip_auth_hdr(skb); int ihl = ip_hdrlen(skb); int ah_hlen = (ah->hdrlen + 2) << 2; if (err) goto out; work_iph = AH_SKB_CB(skb)->tmp; auth_data = ah_tmp_auth(work_iph, ihl); icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) goto out; err = ah->nexthdr; skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, ihl); __skb_pull(skb, ah_hlen + ihl); if (x->props.mode == XFRM_MODE_TUNNEL) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -ihl); out: kfree(AH_SKB_CB(skb)->tmp); xfrm_input_resume(skb, err); } static int ah_input(struct xfrm_state *x, struct sk_buff *skb) { int ah_hlen; int ihl; int nexthdr; int nfrags; u8 *auth_data; u8 *icv; struct sk_buff *trailer; struct crypto_ahash *ahash; struct ahash_request *req; struct scatterlist *sg; struct iphdr *iph, *work_iph; struct ip_auth_hdr *ah; struct ah_data *ahp; int err = -ENOMEM; int seqhi_len = 0; __be32 *seqhi; int sglists = 0; struct scatterlist *seqhisg; if (!pskb_may_pull(skb, sizeof(*ah))) goto out; ah = (struct ip_auth_hdr *)skb->data; ahp = x->data; ahash = ahp->ahash; nexthdr = ah->nexthdr; ah_hlen = (ah->hdrlen + 2) << 2; if (x->props.flags & XFRM_STATE_ALIGN4) { if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) && ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len)) goto out; } else { if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) goto out; } if (!pskb_may_pull(skb, ah_hlen)) goto out; /* We are going to _remove_ AH header to keep sockets happy, * so... Later this can change. 
*/ if (skb_unclone(skb, GFP_ATOMIC)) goto out; skb->ip_summed = CHECKSUM_NONE; if ((err = skb_cow_data(skb, 0, &trailer)) < 0) goto out; nfrags = err; ah = (struct ip_auth_hdr *)skb->data; iph = ip_hdr(skb); ihl = ip_hdrlen(skb); if (x->props.flags & XFRM_STATE_ESN) { sglists = 1; seqhi_len = sizeof(*seqhi); } work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + ahp->icv_trunc_len + seqhi_len); if (!work_iph) { err = -ENOMEM; goto out; } seqhi = (__be32 *)((char *)work_iph + ihl); auth_data = ah_tmp_auth(seqhi, seqhi_len); icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); seqhisg = sg + nfrags; memcpy(work_iph, iph, ihl); memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); memset(ah->auth_data, 0, ahp->icv_trunc_len); iph->ttl = 0; iph->tos = 0; iph->frag_off = 0; iph->check = 0; if (ihl > sizeof(*iph)) { __be32 dummy; err = ip_clear_mutable_options(iph, &dummy); if (err) goto out_free; } skb_push(skb, ihl); sg_init_table(sg, nfrags + sglists); err = skb_to_sgvec_nomark(skb, sg, 0, skb->len); if (unlikely(err < 0)) goto out_free; if (x->props.flags & XFRM_STATE_ESN) { /* Attach seqhi sg right after packet payload */ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; sg_set_buf(seqhisg, seqhi, seqhi_len); } ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah_input_done, skb); AH_SKB_CB(skb)->tmp = work_iph; err = crypto_ahash_digest(req); if (err) { if (err == -EINPROGRESS) goto out; goto out_free; } err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) goto out_free; skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, ihl); __skb_pull(skb, ah_hlen + ihl); if (x->props.mode == XFRM_MODE_TUNNEL) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -ihl); err = nexthdr; out_free: kfree (work_iph); out: return err; } static int ah4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); const struct iphdr *iph = (const struct iphdr *)skb->data; struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; case ICMP_REDIRECT: break; default: return 0; } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); if (!x) return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0); else ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0); xfrm_state_put(x); return 0; } static int ah_init_state(struct xfrm_state *x) { struct ah_data *ahp = NULL; struct xfrm_algo_desc *aalg_desc; struct crypto_ahash *ahash; if (!x->aalg) goto error; if (x->encap) goto error; ahp = kzalloc(sizeof(*ahp), GFP_KERNEL); if (!ahp) return -ENOMEM; ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0); if (IS_ERR(ahash)) goto error; ahp->ahash = ahash; if (crypto_ahash_setkey(ahash, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8)) goto error; /* * Lookup the algorithm description maintained by xfrm_algo, * verify crypto transform properties, and store information * we need for AH processing. This lookup cannot fail here * after a successful crypto_alloc_ahash(). 
*/ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); BUG_ON(!aalg_desc); if (aalg_desc->uinfo.auth.icv_fullbits/8 != crypto_ahash_digestsize(ahash)) { pr_info("%s: %s digestsize %u != %hu\n", __func__, x->aalg->alg_name, crypto_ahash_digestsize(ahash), aalg_desc->uinfo.auth.icv_fullbits / 8); goto error; } ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; if (x->props.flags & XFRM_STATE_ALIGN4) x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); else x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); if (x->props.mode == XFRM_MODE_TUNNEL) x->props.header_len += sizeof(struct iphdr); x->data = ahp; return 0; error: if (ahp) { crypto_free_ahash(ahp->ahash); kfree(ahp); } return -EINVAL; } static void ah_destroy(struct xfrm_state *x) { struct ah_data *ahp = x->data; if (!ahp) return; crypto_free_ahash(ahp->ahash); kfree(ahp); } static int ah4_rcv_cb(struct sk_buff *skb, int err) { return 0; } static const struct xfrm_type ah_type = { .description = "AH4", .owner = THIS_MODULE, .proto = IPPROTO_AH, .flags = XFRM_TYPE_REPLAY_PROT, .init_state = ah_init_state, .destructor = ah_destroy, .input = ah_input, .output = ah_output }; static struct xfrm4_protocol ah4_protocol = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = ah4_rcv_cb, .err_handler = ah4_err, .priority = 0, }; static int __init ah4_init(void) { if (xfrm_register_type(&ah_type, AF_INET) < 0) { pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ah_type, AF_INET); return -EAGAIN; } return 0; } static void __exit ah4_fini(void) { if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0) pr_info("%s: can't remove protocol\n", __func__); if (xfrm_unregister_type(&ah_type, AF_INET) < 0) pr_info("%s: can't remove xfrm type\n", __func__); } module_init(ah4_init); module_exit(ah4_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_AH);
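/*
 * Editor's note (illustrative, not part of ah4.c): worked example of the
 * hdrlen encoding used by ah_output() above.  RFC 4302 stores the AH
 * header length in 32-bit words minus 2, which is why ah_input() decodes
 * it as ah_hlen = (ah->hdrlen + 2) << 2.  Assuming HMAC-SHA1-96 (12-byte
 * truncated ICV) and the default 8-byte alignment:
 *
 *   sizeof(struct ip_auth_hdr)   = 12
 *   12 + icv_trunc_len (12)      = 24
 *   XFRM_ALIGN8(24)              = 24
 *   (24 >> 2) - 2                = 4    -> value written to ah->hdrlen
 *   decode: (4 + 2) << 2         = 24   -> total AH header bytes
 */
static inline unsigned int example_ah_total_len(const struct ip_auth_hdr *ah)
{
	/* Hypothetical helper for illustration only; mirrors the decode in ah_input(). */
	return (ah->hdrlen + 2) << 2;
}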
/* * VMware vSockets Driver * * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation version 2 and no later version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. */ #ifndef __AF_VSOCK_H__ #define __AF_VSOCK_H__ #include <linux/kernel.h> #include <linux/workqueue.h> #include <linux/vm_sockets.h> #include "vsock_addr.h" #define LAST_RESERVED_PORT 1023 #define VSOCK_HASH_SIZE 251 extern struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1]; extern struct list_head vsock_connected_table[VSOCK_HASH_SIZE]; extern spinlock_t vsock_table_lock; #define vsock_sk(__sk) ((struct vsock_sock *)__sk) #define sk_vsock(__vsk) (&(__vsk)->sk) struct vsock_sock { /* sk must be the first member. */ struct sock sk; struct sockaddr_vm local_addr; struct sockaddr_vm remote_addr; /* Links for the global tables of bound and connected sockets. */ struct list_head bound_table; struct list_head connected_table; /* Accessed without the socket lock held. This means it can never be * modified outside of socket create or destruct. */ bool trusted; bool cached_peer_allow_dgram; /* Dgram communication allowed to * cached peer? */ u32 cached_peer; /* Context ID of last dgram destination check. */ const struct cred *owner; /* Rest are SOCK_STREAM only. */ long connect_timeout; /* Listening socket that this came from. */ struct sock *listener; /* Used for pending list and accept queue during connection handshake. * The listening socket is the head for both lists. Sockets created * for connection requests are placed in the pending list until they * are connected, at which point they are put in the accept queue list * so they can be accepted in accept(). If accept() cannot accept the * connection, it is marked as rejected so the cleanup function knows * to clean up the socket. */ struct list_head pending_links; struct list_head accept_queue; bool rejected; struct delayed_work connect_work; struct delayed_work pending_work; struct delayed_work close_work; bool close_work_scheduled; u32 peer_shutdown; bool sent_request; bool ignore_connecting_rst; /* Private to transport.
*/ void *trans; }; s64 vsock_stream_has_data(struct vsock_sock *vsk); s64 vsock_stream_has_space(struct vsock_sock *vsk); struct sock *__vsock_create(struct net *net, struct socket *sock, struct sock *parent, gfp_t priority, unsigned short type, int kern); /**** TRANSPORT ****/ struct vsock_transport_recv_notify_data { u64 data1; /* Transport-defined. */ u64 data2; /* Transport-defined. */ bool notify_on_block; }; struct vsock_transport_send_notify_data { u64 data1; /* Transport-defined. */ u64 data2; /* Transport-defined. */ }; struct vsock_transport { /* Initialize/tear-down socket. */ int (*init)(struct vsock_sock *, struct vsock_sock *); void (*destruct)(struct vsock_sock *); void (*release)(struct vsock_sock *); /* Cancel all pending packets sent on vsock. */ int (*cancel_pkt)(struct vsock_sock *vsk); /* Connections. */ int (*connect)(struct vsock_sock *); /* DGRAM. */ int (*dgram_bind)(struct vsock_sock *, struct sockaddr_vm *); int (*dgram_dequeue)(struct vsock_sock *vsk, struct msghdr *msg, size_t len, int flags); int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *, struct msghdr *, size_t len); bool (*dgram_allow)(u32 cid, u32 port); /* STREAM. */ /* TODO: stream_bind() */ ssize_t (*stream_dequeue)(struct vsock_sock *, struct msghdr *, size_t len, int flags); ssize_t (*stream_enqueue)(struct vsock_sock *, struct msghdr *, size_t len); s64 (*stream_has_data)(struct vsock_sock *); s64 (*stream_has_space)(struct vsock_sock *); u64 (*stream_rcvhiwat)(struct vsock_sock *); bool (*stream_is_active)(struct vsock_sock *); bool (*stream_allow)(u32 cid, u32 port); /* Notification. */ int (*notify_poll_in)(struct vsock_sock *, size_t, bool *); int (*notify_poll_out)(struct vsock_sock *, size_t, bool *); int (*notify_recv_init)(struct vsock_sock *, size_t, struct vsock_transport_recv_notify_data *); int (*notify_recv_pre_block)(struct vsock_sock *, size_t, struct vsock_transport_recv_notify_data *); int (*notify_recv_pre_dequeue)(struct vsock_sock *, size_t, struct vsock_transport_recv_notify_data *); int (*notify_recv_post_dequeue)(struct vsock_sock *, size_t, ssize_t, bool, struct vsock_transport_recv_notify_data *); int (*notify_send_init)(struct vsock_sock *, struct vsock_transport_send_notify_data *); int (*notify_send_pre_block)(struct vsock_sock *, struct vsock_transport_send_notify_data *); int (*notify_send_pre_enqueue)(struct vsock_sock *, struct vsock_transport_send_notify_data *); int (*notify_send_post_enqueue)(struct vsock_sock *, ssize_t, struct vsock_transport_send_notify_data *); /* Shutdown. */ int (*shutdown)(struct vsock_sock *, int); /* Buffer sizes. */ void (*set_buffer_size)(struct vsock_sock *, u64); void (*set_min_buffer_size)(struct vsock_sock *, u64); void (*set_max_buffer_size)(struct vsock_sock *, u64); u64 (*get_buffer_size)(struct vsock_sock *); u64 (*get_min_buffer_size)(struct vsock_sock *); u64 (*get_max_buffer_size)(struct vsock_sock *); /* Addressing. 
*/ u32 (*get_local_cid)(void); }; /**** CORE ****/ int __vsock_core_init(const struct vsock_transport *t, struct module *owner); static inline int vsock_core_init(const struct vsock_transport *t) { return __vsock_core_init(t, THIS_MODULE); } void vsock_core_exit(void); /* The transport may downcast this to access transport-specific functions */ const struct vsock_transport *vsock_core_get_transport(void); /**** UTILS ****/ /* vsock_table_lock must be held */ static inline bool __vsock_in_bound_table(struct vsock_sock *vsk) { return !list_empty(&vsk->bound_table); } /* vsock_table_lock must be held */ static inline bool __vsock_in_connected_table(struct vsock_sock *vsk) { return !list_empty(&vsk->connected_table); } void vsock_release_pending(struct sock *pending); void vsock_add_pending(struct sock *listener, struct sock *pending); void vsock_remove_pending(struct sock *listener, struct sock *pending); void vsock_enqueue_accept(struct sock *listener, struct sock *connected); void vsock_insert_connected(struct vsock_sock *vsk); void vsock_remove_bound(struct vsock_sock *vsk); void vsock_remove_connected(struct vsock_sock *vsk); struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst); void vsock_remove_sock(struct vsock_sock *vsk); void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); /**** TAP ****/ struct vsock_tap { struct net_device *dev; struct module *module; struct list_head list; }; int vsock_init_tap(void); int vsock_add_tap(struct vsock_tap *vt); int vsock_remove_tap(struct vsock_tap *vt); void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque); #endif /* __AF_VSOCK_H__ */
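/*
 * Editor's sketch (illustrative, not part of af_vsock.h): how a transport
 * module might register itself with the vsock core using the API declared
 * above.  example_get_local_cid(), example_transport and example_vsock_init()
 * are hypothetical stand-ins, and <linux/module.h> is assumed to be included;
 * a real transport also fills in the dgram, stream and notify callbacks of
 * struct vsock_transport.
 */
static u32 example_get_local_cid(void)
{
	return VMADDR_CID_HOST;	/* placeholder CID from <linux/vm_sockets.h> */
}

static const struct vsock_transport example_transport = {
	.get_local_cid = example_get_local_cid,
	/* ...remaining callbacks omitted in this sketch... */
};

static int __init example_vsock_init(void)
{
	/* vsock_core_init() expands to __vsock_core_init(t, THIS_MODULE). */
	return vsock_core_init(&example_transport);
}

module_init(example_vsock_init);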
// SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * Copyright (C) 2015 Nobuo Iwata */ #ifndef __USBIP_VHCI_H #define __USBIP_VHCI_H #include <linux/device.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/sysfs.h> #include <linux/types.h> #include <linux/usb.h> #include <linux/usb/hcd.h> #include <linux/wait.h> struct vhci_device { struct usb_device *udev; /* * devid specifies a remote usb device uniquely instead * of combination of busnum and devnum. */ __u32 devid; /* speed of a remote device */ enum usb_device_speed speed; /* vhci root-hub port to which this device is attached */ __u32 rhport; struct usbip_device ud; /* lock for the below link lists */ spinlock_t priv_lock; /* vhci_priv is linked to one of them. */ struct list_head priv_tx; struct list_head priv_rx; /* vhci_unlink is linked to one of them */ struct list_head unlink_tx; struct list_head unlink_rx; /* vhci_tx thread sleeps for this queue */ wait_queue_head_t waitq_tx; }; /* urb->hcpriv, use container_of() */ struct vhci_priv { unsigned long seqnum; struct list_head list; struct vhci_device *vdev; struct urb *urb; }; struct vhci_unlink { /* seqnum of this request */ unsigned long seqnum; struct list_head list; /* seqnum of the unlink target */ unsigned long unlink_seqnum; }; enum hub_speed { HUB_SPEED_HIGH = 0, HUB_SPEED_SUPER, }; /* Number of supported ports. Value has an upper bound of USB_MAXCHILDREN */ #ifdef CONFIG_USBIP_VHCI_HC_PORTS #define VHCI_HC_PORTS CONFIG_USBIP_VHCI_HC_PORTS #else #define VHCI_HC_PORTS 8 #endif /* Each VHCI has 2 hubs (USB2 and USB3), each has VHCI_HC_PORTS ports */ #define VHCI_PORTS (VHCI_HC_PORTS*2) #ifdef CONFIG_USBIP_VHCI_NR_HCS #define VHCI_NR_HCS CONFIG_USBIP_VHCI_NR_HCS #else #define VHCI_NR_HCS 1 #endif #define MAX_STATUS_NAME 16 struct vhci { spinlock_t lock; struct platform_device *pdev; struct vhci_hcd *vhci_hcd_hs; struct vhci_hcd *vhci_hcd_ss; }; /* for usb_hcd.hcd_priv[0] */ struct vhci_hcd { struct vhci *vhci; u32 port_status[VHCI_HC_PORTS]; unsigned resuming:1; unsigned long re_timeout; atomic_t seqnum; /* * NOTE: * wIndex shows the port number and begins from 1. * But, the index of this array begins from 0.
*/ struct vhci_device vdev[VHCI_HC_PORTS]; }; extern int vhci_num_controllers; extern struct vhci *vhcis; extern struct attribute_group vhci_attr_group; /* vhci_hcd.c */ void rh_port_connect(struct vhci_device *vdev, enum usb_device_speed speed); /* vhci_sysfs.c */ int vhci_init_attr_group(void); void vhci_finish_attr_group(void); /* vhci_rx.c */ struct urb *pickup_urb_and_free_priv(struct vhci_device *vdev, __u32 seqnum); int vhci_rx_loop(void *data); /* vhci_tx.c */ int vhci_tx_loop(void *data); static inline __u32 port_to_rhport(__u32 port) { return port % VHCI_HC_PORTS; } static inline int port_to_pdev_nr(__u32 port) { return port / VHCI_PORTS; } static inline struct vhci_hcd *hcd_to_vhci_hcd(struct usb_hcd *hcd) { return (struct vhci_hcd *) (hcd->hcd_priv); } static inline struct device *hcd_dev(struct usb_hcd *hcd) { return (hcd)->self.controller; } static inline const char *hcd_name(struct usb_hcd *hcd) { return (hcd)->self.bus_name; } static inline struct usb_hcd *vhci_hcd_to_hcd(struct vhci_hcd *vhci_hcd) { return container_of((void *) vhci_hcd, struct usb_hcd, hcd_priv); } static inline struct vhci_hcd *vdev_to_vhci_hcd(struct vhci_device *vdev) { return container_of((void *)(vdev - vdev->rhport), struct vhci_hcd, vdev); } #endif /* __USBIP_VHCI_H */
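/*
 * Editor's note (illustrative, not part of vhci.h): worked example for the
 * index helpers above, assuming the default VHCI_HC_PORTS = 8 and therefore
 * VHCI_PORTS = 16.  A global port number is split into a controller index
 * and a root-hub port:
 *
 *   port  3: port_to_pdev_nr(3)  =  3 / 16 = 0,  port_to_rhport(3)  =  3 % 8 = 3
 *   port 19: port_to_pdev_nr(19) = 19 / 16 = 1,  port_to_rhport(19) = 19 % 8 = 3
 *
 * i.e. each controller owns VHCI_PORTS consecutive global ports, and the
 * per-hub index wraps every VHCI_HC_PORTS within that range.
 */
static inline void example_split_port(__u32 port, int *pdev_nr, __u32 *rhport)
{
	/* Hypothetical helper composing the two inlines above. */
	*pdev_nr = port_to_pdev_nr(port);	/* e.g. 19 -> 1 */
	*rhport = port_to_rhport(port);		/* e.g. 19 -> 3 */
}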
/* * net/core/ethtool.c - Ethtool ioctl handler * Copyright (c) 2003 Matthew Wilcox <matthew@wil.cx> * * This file is where we call all the ethtool_ops commands to get * the information ethtool needs.
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. */ #include <linux/module.h> #include <linux/types.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/ethtool.h> #include <linux/netdevice.h> #include <linux/net_tstamp.h> #include <linux/phy.h> #include <linux/bitops.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/sfp.h> #include <linux/slab.h> #include <linux/rtnetlink.h> #include <linux/sched/signal.h> #include <linux/net.h> /* * Some useful ethtool_ops methods that're device independent. * If we find that all drivers want to do the same thing here, * we can turn these into dev_() function calls. */ u32 ethtool_op_get_link(struct net_device *dev) { return netif_carrier_ok(dev) ? 1 : 0; } EXPORT_SYMBOL(ethtool_op_get_link); int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) { info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE | SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; info->phc_index = -1; return 0; } EXPORT_SYMBOL(ethtool_op_get_ts_info); /* Handlers for each ethtool command */ #define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32) static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_SG_BIT] = "tx-scatter-gather", [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4", [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic", [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6", [NETIF_F_HIGHDMA_BIT] = "highdma", [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist", [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert", [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse", [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter", [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert", [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse", [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter", [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", [NETIF_F_GSO_BIT] = "tx-generic-segmentation", [NETIF_F_LLTX_BIT] = "tx-lockless", [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", [NETIF_F_GRO_BIT] = "rx-gro", [NETIF_F_GRO_HW_BIT] = "rx-gro-hw", [NETIF_F_LRO_BIT] = "rx-lro", [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust", [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation", [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation", [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation", [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation", [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation", [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu", [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter", [NETIF_F_RXHASH_BIT] = "rx-hashing", [NETIF_F_RXCSUM_BIT] = "rx-checksum", [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy", [NETIF_F_LOOPBACK_BIT] = "loopback", [NETIF_F_RXFCS_BIT] = "rx-fcs", [NETIF_F_RXALL_BIT] = "rx-all", 
[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", [NETIF_F_HW_TC_BIT] = "hw-tc-offload", [NETIF_F_HW_ESP_BIT] = "esp-hw-offload", [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload", [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record", [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload", [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload", }; static const char rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { [ETH_RSS_HASH_TOP_BIT] = "toeplitz", [ETH_RSS_HASH_XOR_BIT] = "xor", [ETH_RSS_HASH_CRC32_BIT] = "crc32", }; static const char tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { [ETHTOOL_ID_UNSPEC] = "Unspec", [ETHTOOL_RX_COPYBREAK] = "rx-copybreak", [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout", }; static const char phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = { [ETHTOOL_ID_UNSPEC] = "Unspec", [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift", }; static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { .cmd = ETHTOOL_GFEATURES, .size = ETHTOOL_DEV_FEATURE_WORDS, }; struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; u32 __user *sizeaddr; u32 copy_size; int i; /* in case feature bits run out again */ BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t)); for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { features[i].available = (u32)(dev->hw_features >> (32 * i)); features[i].requested = (u32)(dev->wanted_features >> (32 * i)); features[i].active = (u32)(dev->features >> (32 * i)); features[i].never_changed = (u32)(NETIF_F_NEVER_CHANGE >> (32 * i)); } sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size); if (get_user(copy_size, sizeaddr)) return -EFAULT; if (copy_size > ETHTOOL_DEV_FEATURE_WORDS) copy_size = ETHTOOL_DEV_FEATURE_WORDS; if (copy_to_user(useraddr, &cmd, sizeof(cmd))) return -EFAULT; useraddr += sizeof(cmd); if (copy_to_user(useraddr, features, copy_size * sizeof(*features))) return -EFAULT; return 0; } static int ethtool_set_features(struct net_device *dev, void __user *useraddr) { struct ethtool_sfeatures cmd; struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; netdev_features_t wanted = 0, valid = 0; int i, ret = 0; if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; useraddr += sizeof(cmd); if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS) return -EINVAL; if (copy_from_user(features, useraddr, sizeof(features))) return -EFAULT; for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { valid |= (netdev_features_t)features[i].valid << (32 * i); wanted |= (netdev_features_t)features[i].requested << (32 * i); } if (valid & ~NETIF_F_ETHTOOL_BITS) return -EINVAL; if (valid & ~dev->hw_features) { valid &= dev->hw_features; ret |= ETHTOOL_F_UNSUPPORTED; } dev->wanted_features &= ~valid; dev->wanted_features |= wanted & valid; __netdev_update_features(dev); if ((dev->wanted_features ^ dev->features) & valid) ret |= ETHTOOL_F_WISH; return ret; } static int __ethtool_get_sset_count(struct net_device *dev, int sset) { const struct ethtool_ops *ops = dev->ethtool_ops; if (sset == ETH_SS_FEATURES) return ARRAY_SIZE(netdev_features_strings); if (sset == ETH_SS_RSS_HASH_FUNCS) return ARRAY_SIZE(rss_hash_func_strings); if (sset == ETH_SS_TUNABLES) return ARRAY_SIZE(tunable_strings); if (sset == ETH_SS_PHY_TUNABLES) return ARRAY_SIZE(phy_tunable_strings); if (sset == ETH_SS_PHY_STATS && dev->phydev && 
!ops->get_ethtool_phy_stats) return phy_ethtool_get_sset_count(dev->phydev); if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else return -EOPNOTSUPP; } static void __ethtool_get_strings(struct net_device *dev, u32 stringset, u8 *data) { const struct ethtool_ops *ops = dev->ethtool_ops; if (stringset == ETH_SS_FEATURES) memcpy(data, netdev_features_strings, sizeof(netdev_features_strings)); else if (stringset == ETH_SS_RSS_HASH_FUNCS) memcpy(data, rss_hash_func_strings, sizeof(rss_hash_func_strings)); else if (stringset == ETH_SS_TUNABLES) memcpy(data, tunable_strings, sizeof(tunable_strings)); else if (stringset == ETH_SS_PHY_TUNABLES) memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings)); else if (stringset == ETH_SS_PHY_STATS && dev->phydev && !ops->get_ethtool_phy_stats) phy_ethtool_get_strings(dev->phydev, data); else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); } static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd) { /* feature masks of legacy discrete ethtool ops */ switch (eth_cmd) { case ETHTOOL_GTXCSUM: case ETHTOOL_STXCSUM: return NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC; case ETHTOOL_GRXCSUM: case ETHTOOL_SRXCSUM: return NETIF_F_RXCSUM; case ETHTOOL_GSG: case ETHTOOL_SSG: return NETIF_F_SG; case ETHTOOL_GTSO: case ETHTOOL_STSO: return NETIF_F_ALL_TSO; case ETHTOOL_GGSO: case ETHTOOL_SGSO: return NETIF_F_GSO; case ETHTOOL_GGRO: case ETHTOOL_SGRO: return NETIF_F_GRO; default: BUG(); } } static int ethtool_get_one_feature(struct net_device *dev, char __user *useraddr, u32 ethcmd) { netdev_features_t mask = ethtool_get_feature_mask(ethcmd); struct ethtool_value edata = { .cmd = ethcmd, .data = !!(dev->features & mask), }; if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; return 0; } static int ethtool_set_one_feature(struct net_device *dev, void __user *useraddr, u32 ethcmd) { struct ethtool_value edata; netdev_features_t mask; if (copy_from_user(&edata, useraddr, sizeof(edata))) return -EFAULT; mask = ethtool_get_feature_mask(ethcmd); mask &= dev->hw_features; if (!mask) return -EOPNOTSUPP; if (edata.data) dev->wanted_features |= mask; else dev->wanted_features &= ~mask; __netdev_update_features(dev); return 0; } #define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) #define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \ NETIF_F_RXHASH) static u32 __ethtool_get_flags(struct net_device *dev) { u32 flags = 0; if (dev->features & NETIF_F_LRO) flags |= ETH_FLAG_LRO; if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) flags |= ETH_FLAG_RXVLAN; if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) flags |= ETH_FLAG_TXVLAN; if (dev->features & NETIF_F_NTUPLE) flags |= ETH_FLAG_NTUPLE; if (dev->features & NETIF_F_RXHASH) flags |= ETH_FLAG_RXHASH; return flags; } static int __ethtool_set_flags(struct net_device *dev, u32 data) { netdev_features_t features = 0, changed; if (data & ~ETH_ALL_FLAGS) return -EINVAL; if (data & ETH_FLAG_LRO) features |= NETIF_F_LRO; if (data & ETH_FLAG_RXVLAN) features |= NETIF_F_HW_VLAN_CTAG_RX; if (data & ETH_FLAG_TXVLAN) features |= NETIF_F_HW_VLAN_CTAG_TX; if (data & ETH_FLAG_NTUPLE) features |= NETIF_F_NTUPLE; if (data & ETH_FLAG_RXHASH) features |= NETIF_F_RXHASH; /* allow changing only bits set in hw_features */ changed = (features ^ dev->features) & ETH_ALL_FEATURES; if (changed & ~dev->hw_features) return (changed & dev->hw_features) ? 
-EINVAL : -EOPNOTSUPP; dev->wanted_features = (dev->wanted_features & ~changed) | (features & changed); __netdev_update_features(dev); return 0; } /* Given two link masks, AND them together and save the result in dst. */ void ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst, struct ethtool_link_ksettings *src) { unsigned int size = BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS); unsigned int idx = 0; for (; idx < size; idx++) { dst->link_modes.supported[idx] &= src->link_modes.supported[idx]; dst->link_modes.advertising[idx] &= src->link_modes.advertising[idx]; } } EXPORT_SYMBOL(ethtool_intersect_link_masks); void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32) { bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); dst[0] = legacy_u32; } EXPORT_SYMBOL(ethtool_convert_legacy_u32_to_link_mode); /* return false if src had higher bits set. lower bits always updated. */ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, const unsigned long *src) { bool retval = true; /* TODO: following test will soon always be true */ if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) { __ETHTOOL_DECLARE_LINK_MODE_MASK(ext); bitmap_zero(ext, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_fill(ext, 32); bitmap_complement(ext, ext, __ETHTOOL_LINK_MODE_MASK_NBITS); if (bitmap_intersects(ext, src, __ETHTOOL_LINK_MODE_MASK_NBITS)) { /* src mask goes beyond bit 31 */ retval = false; } } *legacy_u32 = src[0]; return retval; } EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32); /* return false if legacy contained non-0 deprecated fields * maxtxpkt/maxrxpkt. rest of ksettings always updated */ static bool convert_legacy_settings_to_link_ksettings( struct ethtool_link_ksettings *link_ksettings, const struct ethtool_cmd *legacy_settings) { bool retval = true; memset(link_ksettings, 0, sizeof(*link_ksettings)); /* This is used to tell users that driver is still using these * deprecated legacy fields, and they should not use * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS */ if (legacy_settings->maxtxpkt || legacy_settings->maxrxpkt) retval = false; ethtool_convert_legacy_u32_to_link_mode( link_ksettings->link_modes.supported, legacy_settings->supported); ethtool_convert_legacy_u32_to_link_mode( link_ksettings->link_modes.advertising, legacy_settings->advertising); ethtool_convert_legacy_u32_to_link_mode( link_ksettings->link_modes.lp_advertising, legacy_settings->lp_advertising); link_ksettings->base.speed = ethtool_cmd_speed(legacy_settings); link_ksettings->base.duplex = legacy_settings->duplex; link_ksettings->base.port = legacy_settings->port; link_ksettings->base.phy_address = legacy_settings->phy_address; link_ksettings->base.autoneg = legacy_settings->autoneg; link_ksettings->base.mdio_support = legacy_settings->mdio_support; link_ksettings->base.eth_tp_mdix = legacy_settings->eth_tp_mdix; link_ksettings->base.eth_tp_mdix_ctrl = legacy_settings->eth_tp_mdix_ctrl; return retval; } /* return false if ksettings link modes had higher bits * set. 
legacy_settings always updated (best effort) */ static bool convert_link_ksettings_to_legacy_settings( struct ethtool_cmd *legacy_settings, const struct ethtool_link_ksettings *link_ksettings) { bool retval = true; memset(legacy_settings, 0, sizeof(*legacy_settings)); /* this also clears the deprecated fields in legacy structure: * __u8 transceiver; * __u32 maxtxpkt; * __u32 maxrxpkt; */ retval &= ethtool_convert_link_mode_to_legacy_u32( &legacy_settings->supported, link_ksettings->link_modes.supported); retval &= ethtool_convert_link_mode_to_legacy_u32( &legacy_settings->advertising, link_ksettings->link_modes.advertising); retval &= ethtool_convert_link_mode_to_legacy_u32( &legacy_settings->lp_advertising, link_ksettings->link_modes.lp_advertising); ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed); legacy_settings->duplex = link_ksettings->base.duplex; legacy_settings->port = link_ksettings->base.port; legacy_settings->phy_address = link_ksettings->base.phy_address; legacy_settings->autoneg = link_ksettings->base.autoneg; legacy_settings->mdio_support = link_ksettings->base.mdio_support; legacy_settings->eth_tp_mdix = link_ksettings->base.eth_tp_mdix; legacy_settings->eth_tp_mdix_ctrl = link_ksettings->base.eth_tp_mdix_ctrl; legacy_settings->transceiver = link_ksettings->base.transceiver; return retval; } /* number of 32-bit words to store the user's link mode bitmaps */ #define __ETHTOOL_LINK_MODE_MASK_NU32 \ DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32) /* layout of the struct passed from/to userland */ struct ethtool_link_usettings { struct ethtool_link_settings base; struct { __u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32]; __u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; __u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; } link_modes; }; /* Internal kernel helper to query a device ethtool_link_settings. * * Backward compatibility note: for compatibility with legacy drivers * that implement only the ethtool_cmd API, this has to work with both * drivers implementing get_link_ksettings API and drivers * implementing get_settings API. When drivers implement get_settings * and report ethtool_cmd deprecated fields * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored * because the resulting struct ethtool_link_settings does not report them. */ int __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings) { int err; struct ethtool_cmd cmd; ASSERT_RTNL(); if (dev->ethtool_ops->get_link_ksettings) { memset(link_ksettings, 0, sizeof(*link_ksettings)); return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings); } /* driver doesn't support %ethtool_link_ksettings API. revert to * legacy %ethtool_cmd API, unless it's not supported either. * TODO: remove when ethtool_ops::get_settings disappears internally */ if (!dev->ethtool_ops->get_settings) return -EOPNOTSUPP; memset(&cmd, 0, sizeof(cmd)); cmd.cmd = ETHTOOL_GSET; err = dev->ethtool_ops->get_settings(dev, &cmd); if (err < 0) return err; /* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt */ convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd); return err; } EXPORT_SYMBOL(__ethtool_get_link_ksettings); /* convert ethtool_link_usettings in user space to a kernel internal * ethtool_link_ksettings. return 0 on success, errno on error. 
*/ static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to, const void __user *from) { struct ethtool_link_usettings link_usettings; if (copy_from_user(&link_usettings, from, sizeof(link_usettings))) return -EFAULT; memcpy(&to->base, &link_usettings.base, sizeof(to->base)); bitmap_from_arr32(to->link_modes.supported, link_usettings.link_modes.supported, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_from_arr32(to->link_modes.advertising, link_usettings.link_modes.advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_from_arr32(to->link_modes.lp_advertising, link_usettings.link_modes.lp_advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); return 0; } /* convert a kernel internal ethtool_link_ksettings to * ethtool_link_usettings in user space. return 0 on success, errno on * error. */ static int store_link_ksettings_for_user(void __user *to, const struct ethtool_link_ksettings *from) { struct ethtool_link_usettings link_usettings; memcpy(&link_usettings, from, sizeof(link_usettings)); bitmap_to_arr32(link_usettings.link_modes.supported, from->link_modes.supported, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_to_arr32(link_usettings.link_modes.advertising, from->link_modes.advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_to_arr32(link_usettings.link_modes.lp_advertising, from->link_modes.lp_advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); if (copy_to_user(to, &link_usettings, sizeof(link_usettings))) return -EFAULT; return 0; } /* Query device for its ethtool_link_settings. * * Backward compatibility note: this function must fail when driver * does not implement ethtool::get_link_ksettings, even if legacy * ethtool_ops::get_settings is implemented. This tells new versions * of ethtool that they should use the legacy API %ETHTOOL_GSET for * this driver, so that they can correctly access the ethtool_cmd * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver * implements ethtool_ops::get_settings anymore. */ static int ethtool_get_link_ksettings(struct net_device *dev, void __user *useraddr) { int err = 0; struct ethtool_link_ksettings link_ksettings; ASSERT_RTNL(); if (!dev->ethtool_ops->get_link_ksettings) return -EOPNOTSUPP; /* handle bitmap nbits handshake */ if (copy_from_user(&link_ksettings.base, useraddr, sizeof(link_ksettings.base))) return -EFAULT; if (__ETHTOOL_LINK_MODE_MASK_NU32 != link_ksettings.base.link_mode_masks_nwords) { /* wrong link mode nbits requested */ memset(&link_ksettings, 0, sizeof(link_ksettings)); link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; /* send back number of words required as negative val */ compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX, "need too many bits for link modes!"); link_ksettings.base.link_mode_masks_nwords = -((s8)__ETHTOOL_LINK_MODE_MASK_NU32); /* copy the base fields back to user, not the link * mode bitmaps */ if (copy_to_user(useraddr, &link_ksettings.base, sizeof(link_ksettings.base))) return -EFAULT; return 0; } /* handshake successful: user/kernel agree on * link_mode_masks_nwords */ memset(&link_ksettings, 0, sizeof(link_ksettings)); err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings); if (err < 0) return err; /* make sure we tell the right values to user */ link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; link_ksettings.base.link_mode_masks_nwords = __ETHTOOL_LINK_MODE_MASK_NU32; return store_link_ksettings_for_user(useraddr, &link_ksettings); } /* Update device ethtool_link_settings. 
* * Backward compatibility note: this function must fail when driver * does not implement ethtool::set_link_ksettings, even if legacy * ethtool_ops::set_settings is implemented. This tells new versions * of ethtool that they should use the legacy API %ETHTOOL_SSET for * this driver, so that they can correctly update the ethtool_cmd * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver * implements ethtool_ops::get_settings anymore. */ static int ethtool_set_link_ksettings(struct net_device *dev, void __user *useraddr) { int err; struct ethtool_link_ksettings link_ksettings; ASSERT_RTNL(); if (!dev->ethtool_ops->set_link_ksettings) return -EOPNOTSUPP; /* make sure nbits field has expected value */ if (copy_from_user(&link_ksettings.base, useraddr, sizeof(link_ksettings.base))) return -EFAULT; if (__ETHTOOL_LINK_MODE_MASK_NU32 != link_ksettings.base.link_mode_masks_nwords) return -EINVAL; /* copy the whole structure, now that we know it has expected * format */ err = load_link_ksettings_from_user(&link_ksettings, useraddr); if (err) return err; /* re-check nwords field, just in case */ if (__ETHTOOL_LINK_MODE_MASK_NU32 != link_ksettings.base.link_mode_masks_nwords) return -EINVAL; return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); } /* Query device for its ethtool_cmd settings. * * Backward compatibility note: for compatibility with legacy ethtool, * this has to work with both drivers implementing get_link_ksettings * API and drivers implementing get_settings API. When drivers * implement get_link_ksettings and report higher link mode bits, a * kernel warning is logged once (with name of 1st driver/device) to * recommend user to upgrade ethtool, but the command is successful * (only the lower link mode bits reported back to user). */ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) { struct ethtool_cmd cmd; ASSERT_RTNL(); if (dev->ethtool_ops->get_link_ksettings) { /* First, use link_ksettings API if it is supported */ int err; struct ethtool_link_ksettings link_ksettings; memset(&link_ksettings, 0, sizeof(link_ksettings)); err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings); if (err < 0) return err; convert_link_ksettings_to_legacy_settings(&cmd, &link_ksettings); /* send a sensible cmd tag back to user */ cmd.cmd = ETHTOOL_GSET; } else { /* driver doesn't support %ethtool_link_ksettings * API. revert to legacy %ethtool_cmd API, unless it's * not supported either. */ int err; if (!dev->ethtool_ops->get_settings) return -EOPNOTSUPP; memset(&cmd, 0, sizeof(cmd)); cmd.cmd = ETHTOOL_GSET; err = dev->ethtool_ops->get_settings(dev, &cmd); if (err < 0) return err; } if (copy_to_user(useraddr, &cmd, sizeof(cmd))) return -EFAULT; return 0; } /* Update device link settings with given ethtool_cmd. * * Backward compatibility note: for compatibility with legacy ethtool, * this has to work with both drivers implementing set_link_ksettings * API and drivers implementing set_settings API. When drivers * implement set_link_ksettings and user's request updates deprecated * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel * warning is logged once (with name of 1st driver/device) to * recommend user to upgrade ethtool, and the request is rejected. */ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) { struct ethtool_cmd cmd; ASSERT_RTNL(); if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; /* first, try new %ethtool_link_ksettings API. 
*/ if (dev->ethtool_ops->set_link_ksettings) { struct ethtool_link_ksettings link_ksettings; if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, &cmd)) return -EINVAL; link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS; link_ksettings.base.link_mode_masks_nwords = __ETHTOOL_LINK_MODE_MASK_NU32; return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); } /* legacy %ethtool_cmd API */ /* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings * disappears internally */ if (!dev->ethtool_ops->set_settings) return -EOPNOTSUPP; return dev->ethtool_ops->set_settings(dev, &cmd); } static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) { struct ethtool_drvinfo info; const struct ethtool_ops *ops = dev->ethtool_ops; memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GDRVINFO; if (ops->get_drvinfo) { ops->get_drvinfo(dev, &info); } else if (dev->dev.parent && dev->dev.parent->driver) { strlcpy(info.bus_info, dev_name(dev->dev.parent), sizeof(info.bus_info)); strlcpy(info.driver, dev->dev.parent->driver->name, sizeof(info.driver)); } else { return -EOPNOTSUPP; } /* * this method of obtaining string set info is deprecated; * Use ETHTOOL_GSSET_INFO instead. */ if (ops->get_sset_count) { int rc; rc = ops->get_sset_count(dev, ETH_SS_TEST); if (rc >= 0) info.testinfo_len = rc; rc = ops->get_sset_count(dev, ETH_SS_STATS); if (rc >= 0) info.n_stats = rc; rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); if (rc >= 0) info.n_priv_flags = rc; } if (ops->get_regs_len) { int ret = ops->get_regs_len(dev); if (ret > 0) info.regdump_len = ret; } if (ops->get_eeprom_len) info.eedump_len = ops->get_eeprom_len(dev); if (copy_to_user(useraddr, &info, sizeof(info))) return -EFAULT; return 0; } static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev, void __user *useraddr) { struct ethtool_sset_info info; u64 sset_mask; int i, idx = 0, n_bits = 0, ret, rc; u32 *info_buf = NULL; if (copy_from_user(&info, useraddr, sizeof(info))) return -EFAULT; /* store copy of mask, because we zero struct later on */ sset_mask = info.sset_mask; if (!sset_mask) return 0; /* calculate size of return buffer */ n_bits = hweight64(sset_mask); memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GSSET_INFO; info_buf = kcalloc(n_bits, sizeof(u32), GFP_USER); if (!info_buf) return -ENOMEM; /* * fill return buffer based on input bitmask and successful * get_sset_count return */ for (i = 0; i < 64; i++) { if (!(sset_mask & (1ULL << i))) continue; rc = __ethtool_get_sset_count(dev, i); if (rc >= 0) { info.sset_mask |= (1ULL << i); info_buf[idx++] = rc; } } ret = -EFAULT; if (copy_to_user(useraddr, &info, sizeof(info))) goto out; useraddr += offsetof(struct ethtool_sset_info, data); if (copy_to_user(useraddr, info_buf, idx * sizeof(u32))) goto out; ret = 0; out: kfree(info_buf); return ret; } static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { struct ethtool_rxnfc info; size_t info_size = sizeof(info); int rc; if (!dev->ethtool_ops->set_rxnfc) return -EOPNOTSUPP; /* struct ethtool_rxnfc was originally defined for * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data * members. User-space might still be using that * definition. 
*/ if (cmd == ETHTOOL_SRXFH) info_size = (offsetof(struct ethtool_rxnfc, data) + sizeof(info.data)); if (copy_from_user(&info, useraddr, info_size)) return -EFAULT; rc = dev->ethtool_ops->set_rxnfc(dev, &info); if (rc) return rc; if (cmd == ETHTOOL_SRXCLSRLINS && copy_to_user(useraddr, &info, info_size)) return -EFAULT; return 0; } static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { struct ethtool_rxnfc info; size_t info_size = sizeof(info); const struct ethtool_ops *ops = dev->ethtool_ops; int ret; void *rule_buf = NULL; if (!ops->get_rxnfc) return -EOPNOTSUPP; /* struct ethtool_rxnfc was originally defined for * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data * members. User-space might still be using that * definition. */ if (cmd == ETHTOOL_GRXFH) info_size = (offsetof(struct ethtool_rxnfc, data) + sizeof(info.data)); if (copy_from_user(&info, useraddr, info_size)) return -EFAULT; /* If FLOW_RSS was requested then user-space must be using the * new definition, as FLOW_RSS is newer. */ if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { info_size = sizeof(info); if (copy_from_user(&info, useraddr, info_size)) return -EFAULT; /* Since malicious users may modify the original data, * we need to check whether FLOW_RSS is still requested. */ if (!(info.flow_type & FLOW_RSS)) return -EINVAL; } if (info.cmd != cmd) return -EINVAL; if (info.cmd == ETHTOOL_GRXCLSRLALL) { if (info.rule_cnt > 0) { if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) rule_buf = kcalloc(info.rule_cnt, sizeof(u32), GFP_USER); if (!rule_buf) return -ENOMEM; } } ret = ops->get_rxnfc(dev, &info, rule_buf); if (ret < 0) goto err_out; ret = -EFAULT; if (copy_to_user(useraddr, &info, info_size)) goto err_out; if (rule_buf) { useraddr += offsetof(struct ethtool_rxnfc, rule_locs); if (copy_to_user(useraddr, rule_buf, info.rule_cnt * sizeof(u32))) goto err_out; } ret = 0; err_out: kfree(rule_buf); return ret; } static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, struct ethtool_rxnfc *rx_rings, u32 size) { int i; if (copy_from_user(indir, useraddr, size * sizeof(indir[0]))) return -EFAULT; /* Validate ring indices */ for (i = 0; i < size; i++) if (indir[i] >= rx_rings->data) return -EINVAL; return 0; } u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly; void netdev_rss_key_fill(void *buffer, size_t len) { BUG_ON(len > sizeof(netdev_rss_key)); net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key)); memcpy(buffer, netdev_rss_key, len); } EXPORT_SYMBOL(netdev_rss_key_fill); static int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max) { u32 dev_size, current_max = 0; u32 *indir; int ret; if (!dev->ethtool_ops->get_rxfh_indir_size || !dev->ethtool_ops->get_rxfh) return -EOPNOTSUPP; dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); if (dev_size == 0) return -EOPNOTSUPP; indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); if (!indir) return -ENOMEM; ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); if (ret) goto out; while (dev_size--) current_max = max(current_max, indir[dev_size]); *max = current_max; out: kfree(indir); return ret; } static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { u32 user_size, dev_size; u32 *indir; int ret; if (!dev->ethtool_ops->get_rxfh_indir_size || !dev->ethtool_ops->get_rxfh) return -EOPNOTSUPP; dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev); if (dev_size == 0) return -EOPNOTSUPP; if (copy_from_user(&user_size, useraddr + 
offsetof(struct ethtool_rxfh_indir, size), sizeof(user_size))) return -EFAULT; if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size), &dev_size, sizeof(dev_size))) return -EFAULT; /* If the user buffer size is 0, this is just a query for the * device table size. Otherwise, if it's smaller than the * device table size it's an error. */ if (user_size < dev_size) return user_size == 0 ? 0 : -EINVAL; indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); if (!indir) return -ENOMEM; ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL); if (ret) goto out; if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, ring_index[0]), indir, dev_size * sizeof(indir[0]))) ret = -EFAULT; out: kfree(indir); return ret; } static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, void __user *useraddr) { struct ethtool_rxnfc rx_rings; u32 user_size, dev_size, i; u32 *indir; const struct ethtool_ops *ops = dev->ethtool_ops; int ret; u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]); if (!ops->get_rxfh_indir_size || !ops->set_rxfh || !ops->get_rxnfc) return -EOPNOTSUPP; dev_size = ops->get_rxfh_indir_size(dev); if (dev_size == 0) return -EOPNOTSUPP; if (copy_from_user(&user_size, useraddr + offsetof(struct ethtool_rxfh_indir, size), sizeof(user_size))) return -EFAULT; if (user_size != 0 && user_size != dev_size) return -EINVAL; indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER); if (!indir) return -ENOMEM; rx_rings.cmd = ETHTOOL_GRXRINGS; ret = ops->get_rxnfc(dev, &rx_rings, NULL); if (ret) goto out; if (user_size == 0) { for (i = 0; i < dev_size; i++) indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); } else { ret = ethtool_copy_validate_indir(indir, useraddr + ringidx_offset, &rx_rings, dev_size); if (ret) goto out; } ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE); if (ret) goto out; /* indicate whether rxfh was set to default */ if (user_size == 0) dev->priv_flags &= ~IFF_RXFH_CONFIGURED; else dev->priv_flags |= IFF_RXFH_CONFIGURED; out: kfree(indir); return ret; } static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, void __user *useraddr) { int ret; const struct ethtool_ops *ops = dev->ethtool_ops; u32 user_indir_size, user_key_size; u32 dev_indir_size = 0, dev_key_size = 0; struct ethtool_rxfh rxfh; u32 total_size; u32 indir_bytes; u32 *indir = NULL; u8 dev_hfunc = 0; u8 *hkey = NULL; u8 *rss_config; if (!ops->get_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) dev_indir_size = ops->get_rxfh_indir_size(dev); if (ops->get_rxfh_key_size) dev_key_size = ops->get_rxfh_key_size(dev); if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; user_indir_size = rxfh.indir_size; user_key_size = rxfh.key_size; /* Check that reserved fields are 0 for now */ if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32) return -EINVAL; /* Most drivers don't handle rss_context, check it's 0 as well */ if (rxfh.rss_context && !ops->get_rxfh_context) return -EOPNOTSUPP; rxfh.indir_size = dev_indir_size; rxfh.key_size = dev_key_size; if (copy_to_user(useraddr, &rxfh, sizeof(rxfh))) return -EFAULT; if ((user_indir_size && (user_indir_size != dev_indir_size)) || (user_key_size && (user_key_size != dev_key_size))) return -EINVAL; indir_bytes = user_indir_size * sizeof(indir[0]); total_size = indir_bytes + user_key_size; rss_config = kzalloc(total_size, GFP_USER); if (!rss_config) return -ENOMEM; if (user_indir_size) indir = (u32 *)rss_config; if (user_key_size) hkey = rss_config + indir_bytes; 
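/* rss_config is one kernel buffer: the indirection table (if requested) sits at the start and the hash key immediately after it; the driver fills both below and the whole block is copied back to user space in a single copy_to_user(). */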
if (rxfh.rss_context) ret = dev->ethtool_ops->get_rxfh_context(dev, indir, hkey, &dev_hfunc, rxfh.rss_context); else ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc); if (ret) goto out; if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc), &dev_hfunc, sizeof(rxfh.hfunc))) { ret = -EFAULT; } else if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_config[0]), rss_config, total_size)) { ret = -EFAULT; } out: kfree(rss_config); return ret; } static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, void __user *useraddr) { int ret; const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_rxnfc rx_rings; struct ethtool_rxfh rxfh; u32 dev_indir_size = 0, dev_key_size = 0, i; u32 *indir = NULL, indir_bytes = 0; u8 *hkey = NULL; u8 *rss_config; u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); bool delete = false; if (!ops->get_rxnfc || !ops->set_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) dev_indir_size = ops->get_rxfh_indir_size(dev); if (ops->get_rxfh_key_size) dev_key_size = ops->get_rxfh_key_size(dev); if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; /* Check that reserved fields are 0 for now */ if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32) return -EINVAL; /* Most drivers don't handle rss_context, check it's 0 as well */ if (rxfh.rss_context && !ops->set_rxfh_context) return -EOPNOTSUPP; /* If either indir, hash key or function is valid, proceed further. * Must request at least one change: indir size, hash key or function. */ if ((rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE && rxfh.indir_size != dev_indir_size) || (rxfh.key_size && (rxfh.key_size != dev_key_size)) || (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE && rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE)) return -EINVAL; if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) indir_bytes = dev_indir_size * sizeof(indir[0]); rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER); if (!rss_config) return -ENOMEM; rx_rings.cmd = ETHTOOL_GRXRINGS; ret = ops->get_rxnfc(dev, &rx_rings, NULL); if (ret) goto out; /* rxfh.indir_size == 0 means reset the indir table to default (master * context) or delete the context (other RSS contexts). * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged. 
*/ if (rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) { indir = (u32 *)rss_config; ret = ethtool_copy_validate_indir(indir, useraddr + rss_cfg_offset, &rx_rings, rxfh.indir_size); if (ret) goto out; } else if (rxfh.indir_size == 0) { if (rxfh.rss_context == 0) { indir = (u32 *)rss_config; for (i = 0; i < dev_indir_size; i++) indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); } else { delete = true; } } if (rxfh.key_size) { hkey = rss_config + indir_bytes; if (copy_from_user(hkey, useraddr + rss_cfg_offset + indir_bytes, rxfh.key_size)) { ret = -EFAULT; goto out; } } if (rxfh.rss_context) ret = ops->set_rxfh_context(dev, indir, hkey, rxfh.hfunc, &rxfh.rss_context, delete); else ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc); if (ret) goto out; if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context), &rxfh.rss_context, sizeof(rxfh.rss_context))) ret = -EFAULT; if (!rxfh.rss_context) { /* indicate whether rxfh was set to default */ if (rxfh.indir_size == 0) dev->priv_flags &= ~IFF_RXFH_CONFIGURED; else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) dev->priv_flags |= IFF_RXFH_CONFIGURED; } out: kfree(rss_config); return ret; } static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) { struct ethtool_regs regs; const struct ethtool_ops *ops = dev->ethtool_ops; void *regbuf; int reglen, ret; if (!ops->get_regs || !ops->get_regs_len) return -EOPNOTSUPP; if (copy_from_user(&regs, useraddr, sizeof(regs))) return -EFAULT; reglen = ops->get_regs_len(dev); if (reglen <= 0) return reglen; if (regs.len > reglen) regs.len = reglen; regbuf = NULL; if (reglen) { regbuf = vzalloc(reglen); if (!regbuf) return -ENOMEM; } if (regs.len < reglen) reglen = regs.len; ops->get_regs(dev, &regs, regbuf); ret = -EFAULT; if (copy_to_user(useraddr, &regs, sizeof(regs))) goto out; useraddr += offsetof(struct ethtool_regs, data); if (copy_to_user(useraddr, regbuf, reglen)) goto out; ret = 0; out: vfree(regbuf); return ret; } static int ethtool_reset(struct net_device *dev, char __user *useraddr) { struct ethtool_value reset; int ret; if (!dev->ethtool_ops->reset) return -EOPNOTSUPP; if (copy_from_user(&reset, useraddr, sizeof(reset))) return -EFAULT; ret = dev->ethtool_ops->reset(dev, &reset.data); if (ret) return ret; if (copy_to_user(useraddr, &reset, sizeof(reset))) return -EFAULT; return 0; } static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) { struct ethtool_wolinfo wol; if (!dev->ethtool_ops->get_wol) return -EOPNOTSUPP; memset(&wol, 0, sizeof(struct ethtool_wolinfo)); wol.cmd = ETHTOOL_GWOL; dev->ethtool_ops->get_wol(dev, &wol); if (copy_to_user(useraddr, &wol, sizeof(wol))) return -EFAULT; return 0; } static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) { struct ethtool_wolinfo wol; int ret; if (!dev->ethtool_ops->set_wol) return -EOPNOTSUPP; if (copy_from_user(&wol, useraddr, sizeof(wol))) return -EFAULT; ret = dev->ethtool_ops->set_wol(dev, &wol); if (ret) return ret; dev->wol_enabled = !!wol.wolopts; return 0; } static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) { struct ethtool_eee edata; int rc; if (!dev->ethtool_ops->get_eee) return -EOPNOTSUPP; memset(&edata, 0, sizeof(struct ethtool_eee)); edata.cmd = ETHTOOL_GEEE; rc = dev->ethtool_ops->get_eee(dev, &edata); if (rc) return rc; if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; return 0; } static int ethtool_set_eee(struct net_device *dev, char __user *useraddr) { struct ethtool_eee edata; if (!dev->ethtool_ops->set_eee)
return -EOPNOTSUPP; if (copy_from_user(&edata, useraddr, sizeof(edata))) return -EFAULT; return dev->ethtool_ops->set_eee(dev, &edata); } static int ethtool_nway_reset(struct net_device *dev) { if (!dev->ethtool_ops->nway_reset) return -EOPNOTSUPP; return dev->ethtool_ops->nway_reset(dev); } static int ethtool_get_link(struct net_device *dev, char __user *useraddr) { struct ethtool_value edata = { .cmd = ETHTOOL_GLINK }; if (!dev->ethtool_ops->get_link) return -EOPNOTSUPP; edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev); if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; return 0; } static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr, int (*getter)(struct net_device *, struct ethtool_eeprom *, u8 *), u32 total_len) { struct ethtool_eeprom eeprom; void __user *userbuf = useraddr + sizeof(eeprom); u32 bytes_remaining; u8 *data; int ret = 0; if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) return -EFAULT; /* Check for wrap and zero */ if (eeprom.offset + eeprom.len <= eeprom.offset) return -EINVAL; /* Check for exceeding total eeprom len */ if (eeprom.offset + eeprom.len > total_len) return -EINVAL; data = kzalloc(PAGE_SIZE, GFP_USER); if (!data) return -ENOMEM; bytes_remaining = eeprom.len; while (bytes_remaining > 0) { eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); ret = getter(dev, &eeprom, data); if (ret) break; if (copy_to_user(userbuf, data, eeprom.len)) { ret = -EFAULT; break; } userbuf += eeprom.len; eeprom.offset += eeprom.len; bytes_remaining -= eeprom.len; } eeprom.len = userbuf - (useraddr + sizeof(eeprom)); eeprom.offset -= eeprom.len; if (copy_to_user(useraddr, &eeprom, sizeof(eeprom))) ret = -EFAULT; kfree(data); return ret; } static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) { const struct ethtool_ops *ops = dev->ethtool_ops; if (!ops->get_eeprom || !ops->get_eeprom_len || !ops->get_eeprom_len(dev)) return -EOPNOTSUPP; return ethtool_get_any_eeprom(dev, useraddr, ops->get_eeprom, ops->get_eeprom_len(dev)); } static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) { struct ethtool_eeprom eeprom; const struct ethtool_ops *ops = dev->ethtool_ops; void __user *userbuf = useraddr + sizeof(eeprom); u32 bytes_remaining; u8 *data; int ret = 0; if (!ops->set_eeprom || !ops->get_eeprom_len || !ops->get_eeprom_len(dev)) return -EOPNOTSUPP; if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) return -EFAULT; /* Check for wrap and zero */ if (eeprom.offset + eeprom.len <= eeprom.offset) return -EINVAL; /* Check for exceeding total eeprom len */ if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) return -EINVAL; data = kzalloc(PAGE_SIZE, GFP_USER); if (!data) return -ENOMEM; bytes_remaining = eeprom.len; while (bytes_remaining > 0) { eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); if (copy_from_user(data, userbuf, eeprom.len)) { ret = -EFAULT; break; } ret = ops->set_eeprom(dev, &eeprom, data); if (ret) break; userbuf += eeprom.len; eeprom.offset += eeprom.len; bytes_remaining -= eeprom.len; } kfree(data); return ret; } static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) { struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; if (!dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; dev->ethtool_ops->get_coalesce(dev, &coalesce); if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) return -EFAULT; return 0; } static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, void __user 
*useraddr) { struct ethtool_coalesce coalesce; if (!dev->ethtool_ops->set_coalesce) return -EOPNOTSUPP; if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) return -EFAULT; return dev->ethtool_ops->set_coalesce(dev, &coalesce); } static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) { struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM }; if (!dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; dev->ethtool_ops->get_ringparam(dev, &ringparam); if (copy_to_user(useraddr, &ringparam, sizeof(ringparam))) return -EFAULT; return 0; } static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) { struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM }; if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) return -EFAULT; dev->ethtool_ops->get_ringparam(dev, &max); /* ensure new ring parameters are within the maximums */ if (ringparam.rx_pending > max.rx_max_pending || ringparam.rx_mini_pending > max.rx_mini_max_pending || ringparam.rx_jumbo_pending > max.rx_jumbo_max_pending || ringparam.tx_pending > max.tx_max_pending) return -EINVAL; return dev->ethtool_ops->set_ringparam(dev, &ringparam); } static noinline_for_stack int ethtool_get_channels(struct net_device *dev, void __user *useraddr) { struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; if (!dev->ethtool_ops->get_channels) return -EOPNOTSUPP; dev->ethtool_ops->get_channels(dev, &channels); if (copy_to_user(useraddr, &channels, sizeof(channels))) return -EFAULT; return 0; } static noinline_for_stack int ethtool_set_channels(struct net_device *dev, void __user *useraddr) { struct ethtool_channels channels, max = { .cmd = ETHTOOL_GCHANNELS }; u32 max_rx_in_use = 0; if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) return -EOPNOTSUPP; if (copy_from_user(&channels, useraddr, sizeof(channels))) return -EFAULT; dev->ethtool_ops->get_channels(dev, &max); /* ensure new counts are within the maximums */ if ((channels.rx_count > max.max_rx) || (channels.tx_count > max.max_tx) || (channels.combined_count > max.max_combined) || (channels.other_count > max.max_other)) return -EINVAL; /* ensure the new Rx count fits within the configured Rx flow * indirection table settings */ if (netif_is_rxfh_configured(dev) && !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) && (channels.combined_count + channels.rx_count) <= max_rx_in_use) return -EINVAL; return dev->ethtool_ops->set_channels(dev, &channels); } static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr) { struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM }; if (!dev->ethtool_ops->get_pauseparam) return -EOPNOTSUPP; dev->ethtool_ops->get_pauseparam(dev, &pauseparam); if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam))) return -EFAULT; return 0; } static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) { struct ethtool_pauseparam pauseparam; if (!dev->ethtool_ops->set_pauseparam) return -EOPNOTSUPP; if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam))) return -EFAULT; return dev->ethtool_ops->set_pauseparam(dev, &pauseparam); } static int ethtool_self_test(struct net_device *dev, char __user *useraddr) { struct ethtool_test test; const struct ethtool_ops *ops = dev->ethtool_ops; u64 *data; int ret, test_len; if (!ops->self_test || !ops->get_sset_count) return -EOPNOTSUPP; test_len = ops->get_sset_count(dev, 
ETH_SS_TEST); if (test_len < 0) return test_len; WARN_ON(test_len == 0); if (copy_from_user(&test, useraddr, sizeof(test))) return -EFAULT; test.len = test_len; data = kcalloc(test_len, sizeof(u64), GFP_USER); if (!data) return -ENOMEM; ops->self_test(dev, &test, data); ret = -EFAULT; if (copy_to_user(useraddr, &test, sizeof(test))) goto out; useraddr += sizeof(test); if (copy_to_user(useraddr, data, test.len * sizeof(u64))) goto out; ret = 0; out: kfree(data); return ret; } static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) { struct ethtool_gstrings gstrings; u8 *data; int ret; if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) return -EFAULT; ret = __ethtool_get_sset_count(dev, gstrings.string_set); if (ret < 0) return ret; if (ret > S32_MAX / ETH_GSTRING_LEN) return -ENOMEM; WARN_ON_ONCE(!ret); gstrings.len = ret; if (gstrings.len) { data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN)); if (!data) return -ENOMEM; __ethtool_get_strings(dev, gstrings.string_set, data); } else { data = NULL; } ret = -EFAULT; if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) goto out; useraddr += sizeof(gstrings); if (gstrings.len && copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) goto out; ret = 0; out: vfree(data); return ret; } static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) { struct ethtool_value id; static bool busy; const struct ethtool_ops *ops = dev->ethtool_ops; int rc; if (!ops->set_phys_id) return -EOPNOTSUPP; if (busy) return -EBUSY; if (copy_from_user(&id, useraddr, sizeof(id))) return -EFAULT; rc = ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE); if (rc < 0) return rc; /* Drop the RTNL lock while waiting, but prevent reentry or * removal of the device. */ busy = true; dev_hold(dev); rtnl_unlock(); if (rc == 0) { /* Driver will handle this itself */ schedule_timeout_interruptible( id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT); } else { /* Driver expects to be called at twice the frequency in rc */ int n = rc * 2, i, interval = HZ / n; /* Count down seconds */ do { /* Count down iterations per second */ i = n; do { rtnl_lock(); rc = ops->set_phys_id(dev, (i & 1) ? 
ETHTOOL_ID_OFF : ETHTOOL_ID_ON); rtnl_unlock(); if (rc) break; schedule_timeout_interruptible(interval); } while (!signal_pending(current) && --i != 0); } while (!signal_pending(current) && (id.data == 0 || --id.data != 0)); } rtnl_lock(); dev_put(dev); busy = false; (void) ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE); return rc; } static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) { struct ethtool_stats stats; const struct ethtool_ops *ops = dev->ethtool_ops; u64 *data; int ret, n_stats; if (!ops->get_ethtool_stats || !ops->get_sset_count) return -EOPNOTSUPP; n_stats = ops->get_sset_count(dev, ETH_SS_STATS); if (n_stats < 0) return n_stats; if (n_stats > S32_MAX / sizeof(u64)) return -ENOMEM; WARN_ON_ONCE(!n_stats); if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; stats.n_stats = n_stats; if (n_stats) { data = vzalloc(array_size(n_stats, sizeof(u64))); if (!data) return -ENOMEM; ops->get_ethtool_stats(dev, &stats, data); } else { data = NULL; } ret = -EFAULT; if (copy_to_user(useraddr, &stats, sizeof(stats))) goto out; useraddr += sizeof(stats); if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64))) goto out; ret = 0; out: vfree(data); return ret; } static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) { const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; struct ethtool_stats stats; u64 *data; int ret, n_stats; if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count)) return -EOPNOTSUPP; if (dev->phydev && !ops->get_ethtool_phy_stats) n_stats = phy_ethtool_get_sset_count(dev->phydev); else n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); if (n_stats < 0) return n_stats; if (n_stats > S32_MAX / sizeof(u64)) return -ENOMEM; WARN_ON_ONCE(!n_stats); if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; stats.n_stats = n_stats; if (n_stats) { data = vzalloc(array_size(n_stats, sizeof(u64))); if (!data) return -ENOMEM; if (dev->phydev && !ops->get_ethtool_phy_stats) { ret = phy_ethtool_get_stats(dev->phydev, &stats, data); if (ret < 0) goto out; } else { ops->get_ethtool_phy_stats(dev, &stats, data); } } else { data = NULL; } ret = -EFAULT; if (copy_to_user(useraddr, &stats, sizeof(stats))) goto out; useraddr += sizeof(stats); if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64))) goto out; ret = 0; out: vfree(data); return ret; } static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr) { struct ethtool_perm_addr epaddr; if (copy_from_user(&epaddr, useraddr, sizeof(epaddr))) return -EFAULT; if (epaddr.size < dev->addr_len) return -ETOOSMALL; epaddr.size = dev->addr_len; if (copy_to_user(useraddr, &epaddr, sizeof(epaddr))) return -EFAULT; useraddr += sizeof(epaddr); if (copy_to_user(useraddr, dev->perm_addr, epaddr.size)) return -EFAULT; return 0; } static int ethtool_get_value(struct net_device *dev, char __user *useraddr, u32 cmd, u32 (*actor)(struct net_device *)) { struct ethtool_value edata = { .cmd = cmd }; if (!actor) return -EOPNOTSUPP; edata.data = actor(dev); if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; return 0; } static int ethtool_set_value_void(struct net_device *dev, char __user *useraddr, void (*actor)(struct net_device *, u32)) { struct ethtool_value edata; if (!actor) return -EOPNOTSUPP; if (copy_from_user(&edata, useraddr, sizeof(edata))) return -EFAULT; actor(dev, edata.data); return 0; } static int ethtool_set_value(struct net_device *dev, char __user *useraddr, int 
(*actor)(struct net_device *, u32)) { struct ethtool_value edata; if (!actor) return -EOPNOTSUPP; if (copy_from_user(&edata, useraddr, sizeof(edata))) return -EFAULT; return actor(dev, edata.data); } static noinline_for_stack int ethtool_flash_device(struct net_device *dev, char __user *useraddr) { struct ethtool_flash efl; if (copy_from_user(&efl, useraddr, sizeof(efl))) return -EFAULT; if (!dev->ethtool_ops->flash_device) return -EOPNOTSUPP; efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; return dev->ethtool_ops->flash_device(dev, &efl); } static int ethtool_set_dump(struct net_device *dev, void __user *useraddr) { struct ethtool_dump dump; if (!dev->ethtool_ops->set_dump) return -EOPNOTSUPP; if (copy_from_user(&dump, useraddr, sizeof(dump))) return -EFAULT; return dev->ethtool_ops->set_dump(dev, &dump); } static int ethtool_get_dump_flag(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_dump dump; const struct ethtool_ops *ops = dev->ethtool_ops; if (!ops->get_dump_flag) return -EOPNOTSUPP; if (copy_from_user(&dump, useraddr, sizeof(dump))) return -EFAULT; ret = ops->get_dump_flag(dev, &dump); if (ret) return ret; if (copy_to_user(useraddr, &dump, sizeof(dump))) return -EFAULT; return 0; } static int ethtool_get_dump_data(struct net_device *dev, void __user *useraddr) { int ret; __u32 len; struct ethtool_dump dump, tmp; const struct ethtool_ops *ops = dev->ethtool_ops; void *data = NULL; if (!ops->get_dump_data || !ops->get_dump_flag) return -EOPNOTSUPP; if (copy_from_user(&dump, useraddr, sizeof(dump))) return -EFAULT; memset(&tmp, 0, sizeof(tmp)); tmp.cmd = ETHTOOL_GET_DUMP_FLAG; ret = ops->get_dump_flag(dev, &tmp); if (ret) return ret; len = min(tmp.len, dump.len); if (!len) return -EFAULT; /* Don't ever let the driver think there's more space available * than it requested with .get_dump_flag(). */ dump.len = len; /* Always allocate enough space to hold the whole thing so that the * driver does not need to check the length and bother with partial * dumping. */ data = vzalloc(tmp.len); if (!data) return -ENOMEM; ret = ops->get_dump_data(dev, &dump, data); if (ret) goto out; /* There are two sane possibilities: * 1. The driver's .get_dump_data() does not touch dump.len. * 2. Or it may set dump.len to how much it really writes, which * should be tmp.len (or len if it can do a partial dump). * In any case respond to userspace with the actual length of data * it's receiving. 
*/ WARN_ON(dump.len != len && dump.len != tmp.len); dump.len = len; if (copy_to_user(useraddr, &dump, sizeof(dump))) { ret = -EFAULT; goto out; } useraddr += offsetof(struct ethtool_dump, data); if (copy_to_user(useraddr, data, len)) ret = -EFAULT; out: vfree(data); return ret; } static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr) { int err = 0; struct ethtool_ts_info info; const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GET_TS_INFO; if (phydev && phydev->drv && phydev->drv->ts_info) { err = phydev->drv->ts_info(phydev, &info); } else if (ops->get_ts_info) { err = ops->get_ts_info(dev, &info); } else { info.so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; info.phc_index = -1; } if (err) return err; if (copy_to_user(useraddr, &info, sizeof(info))) err = -EFAULT; return err; } static int __ethtool_get_module_info(struct net_device *dev, struct ethtool_modinfo *modinfo) { const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; if (dev->sfp_bus) return sfp_get_module_info(dev->sfp_bus, modinfo); if (phydev && phydev->drv && phydev->drv->module_info) return phydev->drv->module_info(phydev, modinfo); if (ops->get_module_info) return ops->get_module_info(dev, modinfo); return -EOPNOTSUPP; } static int ethtool_get_module_info(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_modinfo modinfo; if (copy_from_user(&modinfo, useraddr, sizeof(modinfo))) return -EFAULT; ret = __ethtool_get_module_info(dev, &modinfo); if (ret) return ret; if (copy_to_user(useraddr, &modinfo, sizeof(modinfo))) return -EFAULT; return 0; } static int __ethtool_get_module_eeprom(struct net_device *dev, struct ethtool_eeprom *ee, u8 *data) { const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; if (dev->sfp_bus) return sfp_get_module_eeprom(dev->sfp_bus, ee, data); if (phydev && phydev->drv && phydev->drv->module_eeprom) return phydev->drv->module_eeprom(phydev, ee, data); if (ops->get_module_eeprom) return ops->get_module_eeprom(dev, ee, data); return -EOPNOTSUPP; } static int ethtool_get_module_eeprom(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_modinfo modinfo; ret = __ethtool_get_module_info(dev, &modinfo); if (ret) return ret; return ethtool_get_any_eeprom(dev, useraddr, __ethtool_get_module_eeprom, modinfo.eeprom_len); } static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) { switch (tuna->id) { case ETHTOOL_RX_COPYBREAK: case ETHTOOL_TX_COPYBREAK: if (tuna->len != sizeof(u32) || tuna->type_id != ETHTOOL_TUNABLE_U32) return -EINVAL; break; case ETHTOOL_PFC_PREVENTION_TOUT: if (tuna->len != sizeof(u16) || tuna->type_id != ETHTOOL_TUNABLE_U16) return -EINVAL; break; default: return -EINVAL; } return 0; } static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_tunable tuna; const struct ethtool_ops *ops = dev->ethtool_ops; void *data; if (!ops->get_tunable) return -EOPNOTSUPP; if (copy_from_user(&tuna, useraddr, sizeof(tuna))) return -EFAULT; ret = ethtool_tunable_valid(&tuna); if (ret) return ret; data = kzalloc(tuna.len, GFP_USER); if (!data) return -ENOMEM; ret = ops->get_tunable(dev, &tuna, data); if (ret) goto out; useraddr += sizeof(tuna); ret = -EFAULT; if (copy_to_user(useraddr, data, tuna.len)) goto out; ret = 0; out: kfree(data); return ret; } static int ethtool_set_tunable(struct 
net_device *dev, void __user *useraddr) { int ret; struct ethtool_tunable tuna; const struct ethtool_ops *ops = dev->ethtool_ops; void *data; if (!ops->set_tunable) return -EOPNOTSUPP; if (copy_from_user(&tuna, useraddr, sizeof(tuna))) return -EFAULT; ret = ethtool_tunable_valid(&tuna); if (ret) return ret; useraddr += sizeof(tuna); data = memdup_user(useraddr, tuna.len); if (IS_ERR(data)) return PTR_ERR(data); ret = ops->set_tunable(dev, &tuna, data); kfree(data); return ret; } static noinline_for_stack int ethtool_get_per_queue_coalesce(struct net_device *dev, void __user *useraddr, struct ethtool_per_queue_op *per_queue_opt) { u32 bit; int ret; DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); if (!dev->ethtool_ops->get_per_queue_coalesce) return -EOPNOTSUPP; useraddr += sizeof(*per_queue_opt); bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask, MAX_NUM_QUEUE); for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, &coalesce); if (ret != 0) return ret; if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) return -EFAULT; useraddr += sizeof(coalesce); } return 0; } static noinline_for_stack int ethtool_set_per_queue_coalesce(struct net_device *dev, void __user *useraddr, struct ethtool_per_queue_op *per_queue_opt) { u32 bit; int i, ret = 0; int n_queue; struct ethtool_coalesce *backup = NULL, *tmp = NULL; DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); if ((!dev->ethtool_ops->set_per_queue_coalesce) || (!dev->ethtool_ops->get_per_queue_coalesce)) return -EOPNOTSUPP; useraddr += sizeof(*per_queue_opt); bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask, MAX_NUM_QUEUE); n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE); tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL); if (!backup) return -ENOMEM; for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { struct ethtool_coalesce coalesce; ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, tmp); if (ret != 0) goto roll_back; tmp++; if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) { ret = -EFAULT; goto roll_back; } ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce); if (ret != 0) goto roll_back; useraddr += sizeof(coalesce); } roll_back: if (ret != 0) { tmp = backup; for_each_set_bit(i, queue_mask, bit) { dev->ethtool_ops->set_per_queue_coalesce(dev, i, tmp); tmp++; } } kfree(backup); return ret; } static int noinline_for_stack ethtool_set_per_queue(struct net_device *dev, void __user *useraddr, u32 sub_cmd) { struct ethtool_per_queue_op per_queue_opt; if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt))) return -EFAULT; if (per_queue_opt.sub_command != sub_cmd) return -EINVAL; switch (per_queue_opt.sub_command) { case ETHTOOL_GCOALESCE: return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt); case ETHTOOL_SCOALESCE: return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt); default: return -EOPNOTSUPP; }; } static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna) { switch (tuna->id) { case ETHTOOL_PHY_DOWNSHIFT: if (tuna->len != sizeof(u8) || tuna->type_id != ETHTOOL_TUNABLE_U8) return -EINVAL; break; default: return -EINVAL; } return 0; } static int get_phy_tunable(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_tunable tuna; struct phy_device *phydev = dev->phydev; void *data; if (!(phydev && phydev->drv && phydev->drv->get_tunable)) return -EOPNOTSUPP; if (copy_from_user(&tuna, useraddr, 
sizeof(tuna))) return -EFAULT; ret = ethtool_phy_tunable_valid(&tuna); if (ret) return ret; data = kzalloc(tuna.len, GFP_USER); if (!data) return -ENOMEM; mutex_lock(&phydev->lock); ret = phydev->drv->get_tunable(phydev, &tuna, data); mutex_unlock(&phydev->lock); if (ret) goto out; useraddr += sizeof(tuna); ret = -EFAULT; if (copy_to_user(useraddr, data, tuna.len)) goto out; ret = 0; out: kfree(data); return ret; } static int set_phy_tunable(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_tunable tuna; struct phy_device *phydev = dev->phydev; void *data; if (!(phydev && phydev->drv && phydev->drv->set_tunable)) return -EOPNOTSUPP; if (copy_from_user(&tuna, useraddr, sizeof(tuna))) return -EFAULT; ret = ethtool_phy_tunable_valid(&tuna); if (ret) return ret; useraddr += sizeof(tuna); data = memdup_user(useraddr, tuna.len); if (IS_ERR(data)) return PTR_ERR(data); mutex_lock(&phydev->lock); ret = phydev->drv->set_tunable(phydev, &tuna, data); mutex_unlock(&phydev->lock); kfree(data); return ret; } static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr) { struct ethtool_fecparam fecparam = { ETHTOOL_GFECPARAM }; int rc; if (!dev->ethtool_ops->get_fecparam) return -EOPNOTSUPP; rc = dev->ethtool_ops->get_fecparam(dev, &fecparam); if (rc) return rc; if (copy_to_user(useraddr, &fecparam, sizeof(fecparam))) return -EFAULT; return 0; } static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr) { struct ethtool_fecparam fecparam; if (!dev->ethtool_ops->set_fecparam) return -EOPNOTSUPP; if (copy_from_user(&fecparam, useraddr, sizeof(fecparam))) return -EFAULT; return dev->ethtool_ops->set_fecparam(dev, &fecparam); } /* The main entry point in this file. Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) { struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); void __user *useraddr = ifr->ifr_data; u32 ethcmd, sub_cmd; int rc; netdev_features_t old_features; if (!dev || !netif_device_present(dev)) return -ENODEV; if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd))) return -EFAULT; if (ethcmd == ETHTOOL_PERQUEUE) { if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd))) return -EFAULT; } else { sub_cmd = ethcmd; } /* Allow some commands to be done by anyone */ switch (sub_cmd) { case ETHTOOL_GSET: case ETHTOOL_GDRVINFO: case ETHTOOL_GMSGLVL: case ETHTOOL_GLINK: case ETHTOOL_GCOALESCE: case ETHTOOL_GRINGPARAM: case ETHTOOL_GPAUSEPARAM: case ETHTOOL_GRXCSUM: case ETHTOOL_GTXCSUM: case ETHTOOL_GSG: case ETHTOOL_GSSET_INFO: case ETHTOOL_GSTRINGS: case ETHTOOL_GSTATS: case ETHTOOL_GPHYSTATS: case ETHTOOL_GTSO: case ETHTOOL_GPERMADDR: case ETHTOOL_GUFO: case ETHTOOL_GGSO: case ETHTOOL_GGRO: case ETHTOOL_GFLAGS: case ETHTOOL_GPFLAGS: case ETHTOOL_GRXFH: case ETHTOOL_GRXRINGS: case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: case ETHTOOL_GRXFHINDIR: case ETHTOOL_GRSSH: case ETHTOOL_GFEATURES: case ETHTOOL_GCHANNELS: case ETHTOOL_GET_TS_INFO: case ETHTOOL_GEEE: case ETHTOOL_GTUNABLE: case ETHTOOL_PHY_GTUNABLE: case ETHTOOL_GLINKSETTINGS: case ETHTOOL_GFECPARAM: break; default: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; } if (dev->ethtool_ops->begin) { rc = dev->ethtool_ops->begin(dev); if (rc < 0) return rc; } old_features = dev->features; switch (ethcmd) { case ETHTOOL_GSET: rc = ethtool_get_settings(dev, useraddr); break; case ETHTOOL_SSET: rc = ethtool_set_settings(dev, useraddr); break; case ETHTOOL_GDRVINFO: rc =
ethtool_get_drvinfo(dev, useraddr); break; case ETHTOOL_GREGS: rc = ethtool_get_regs(dev, useraddr); break; case ETHTOOL_GWOL: rc = ethtool_get_wol(dev, useraddr); break; case ETHTOOL_SWOL: rc = ethtool_set_wol(dev, useraddr); break; case ETHTOOL_GMSGLVL: rc = ethtool_get_value(dev, useraddr, ethcmd, dev->ethtool_ops->get_msglevel); break; case ETHTOOL_SMSGLVL: rc = ethtool_set_value_void(dev, useraddr, dev->ethtool_ops->set_msglevel); break; case ETHTOOL_GEEE: rc = ethtool_get_eee(dev, useraddr); break; case ETHTOOL_SEEE: rc = ethtool_set_eee(dev, useraddr); break; case ETHTOOL_NWAY_RST: rc = ethtool_nway_reset(dev); break; case ETHTOOL_GLINK: rc = ethtool_get_link(dev, useraddr); break; case ETHTOOL_GEEPROM: rc = ethtool_get_eeprom(dev, useraddr); break; case ETHTOOL_SEEPROM: rc = ethtool_set_eeprom(dev, useraddr); break; case ETHTOOL_GCOALESCE: rc = ethtool_get_coalesce(dev, useraddr); break; case ETHTOOL_SCOALESCE: rc = ethtool_set_coalesce(dev, useraddr); break; case ETHTOOL_GRINGPARAM: rc = ethtool_get_ringparam(dev, useraddr); break; case ETHTOOL_SRINGPARAM: rc = ethtool_set_ringparam(dev, useraddr); break; case ETHTOOL_GPAUSEPARAM: rc = ethtool_get_pauseparam(dev, useraddr); break; case ETHTOOL_SPAUSEPARAM: rc = ethtool_set_pauseparam(dev, useraddr); break; case ETHTOOL_TEST: rc = ethtool_self_test(dev, useraddr); break; case ETHTOOL_GSTRINGS: rc = ethtool_get_strings(dev, useraddr); break; case ETHTOOL_PHYS_ID: rc = ethtool_phys_id(dev, useraddr); break; case ETHTOOL_GSTATS: rc = ethtool_get_stats(dev, useraddr); break; case ETHTOOL_GPERMADDR: rc = ethtool_get_perm_addr(dev, useraddr); break; case ETHTOOL_GFLAGS: rc = ethtool_get_value(dev, useraddr, ethcmd, __ethtool_get_flags); break; case ETHTOOL_SFLAGS: rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); break; case ETHTOOL_GPFLAGS: rc = ethtool_get_value(dev, useraddr, ethcmd, dev->ethtool_ops->get_priv_flags); break; case ETHTOOL_SPFLAGS: rc = ethtool_set_value(dev, useraddr, dev->ethtool_ops->set_priv_flags); break; case ETHTOOL_GRXFH: case ETHTOOL_GRXRINGS: case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: rc = ethtool_get_rxnfc(dev, ethcmd, useraddr); break; case ETHTOOL_SRXFH: case ETHTOOL_SRXCLSRLDEL: case ETHTOOL_SRXCLSRLINS: rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); break; case ETHTOOL_FLASHDEV: rc = ethtool_flash_device(dev, useraddr); break; case ETHTOOL_RESET: rc = ethtool_reset(dev, useraddr); break; case ETHTOOL_GSSET_INFO: rc = ethtool_get_sset_info(dev, useraddr); break; case ETHTOOL_GRXFHINDIR: rc = ethtool_get_rxfh_indir(dev, useraddr); break; case ETHTOOL_SRXFHINDIR: rc = ethtool_set_rxfh_indir(dev, useraddr); break; case ETHTOOL_GRSSH: rc = ethtool_get_rxfh(dev, useraddr); break; case ETHTOOL_SRSSH: rc = ethtool_set_rxfh(dev, useraddr); break; case ETHTOOL_GFEATURES: rc = ethtool_get_features(dev, useraddr); break; case ETHTOOL_SFEATURES: rc = ethtool_set_features(dev, useraddr); break; case ETHTOOL_GTXCSUM: case ETHTOOL_GRXCSUM: case ETHTOOL_GSG: case ETHTOOL_GTSO: case ETHTOOL_GGSO: case ETHTOOL_GGRO: rc = ethtool_get_one_feature(dev, useraddr, ethcmd); break; case ETHTOOL_STXCSUM: case ETHTOOL_SRXCSUM: case ETHTOOL_SSG: case ETHTOOL_STSO: case ETHTOOL_SGSO: case ETHTOOL_SGRO: rc = ethtool_set_one_feature(dev, useraddr, ethcmd); break; case ETHTOOL_GCHANNELS: rc = ethtool_get_channels(dev, useraddr); break; case ETHTOOL_SCHANNELS: rc = ethtool_set_channels(dev, useraddr); break; case ETHTOOL_SET_DUMP: rc = ethtool_set_dump(dev, useraddr); break; case 
ETHTOOL_GET_DUMP_FLAG: rc = ethtool_get_dump_flag(dev, useraddr); break; case ETHTOOL_GET_DUMP_DATA: rc = ethtool_get_dump_data(dev, useraddr); break; case ETHTOOL_GET_TS_INFO: rc = ethtool_get_ts_info(dev, useraddr); break; case ETHTOOL_GMODULEINFO: rc = ethtool_get_module_info(dev, useraddr); break; case ETHTOOL_GMODULEEEPROM: rc = ethtool_get_module_eeprom(dev, useraddr); break; case ETHTOOL_GTUNABLE: rc = ethtool_get_tunable(dev, useraddr); break; case ETHTOOL_STUNABLE: rc = ethtool_set_tunable(dev, useraddr); break; case ETHTOOL_GPHYSTATS: rc = ethtool_get_phy_stats(dev, useraddr); break; case ETHTOOL_PERQUEUE: rc = ethtool_set_per_queue(dev, useraddr, sub_cmd); break; case ETHTOOL_GLINKSETTINGS: rc = ethtool_get_link_ksettings(dev, useraddr); break; case ETHTOOL_SLINKSETTINGS: rc = ethtool_set_link_ksettings(dev, useraddr); break; case ETHTOOL_PHY_GTUNABLE: rc = get_phy_tunable(dev, useraddr); break; case ETHTOOL_PHY_STUNABLE: rc = set_phy_tunable(dev, useraddr); break; case ETHTOOL_GFECPARAM: rc = ethtool_get_fecparam(dev, useraddr); break; case ETHTOOL_SFECPARAM: rc = ethtool_set_fecparam(dev, useraddr); break; default: rc = -EOPNOTSUPP; } if (dev->ethtool_ops->complete) dev->ethtool_ops->complete(dev); if (old_features != dev->features) netdev_features_change(dev); return rc; }
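For context, a minimal user-space sketch of the ioctl path that dev_ethtool() serves: the request travels in a struct ifreq under SIOCETHTOOL, with ifr_data pointing at an ethtool command struct. The standalone program and the interface name "eth0" are illustrative assumptions, not part of the kernel sources; ETHTOOL_GDRVINFO is chosen because it is one of the commands the switch above allows without CAP_NET_ADMIN.

/*
 * User-space sketch, not kernel code: query driver info via SIOCETHTOOL.
 */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
	struct ethtool_drvinfo drvinfo = { .cmd = ETHTOOL_GDRVINFO };
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);	/* example interface name */
	ifr.ifr_data = (void *)&drvinfo;		/* command struct handed to dev_ethtool() */
	if (ioctl(fd, SIOCETHTOOL, &ifr) == 0)
		printf("driver=%s version=%s bus=%s\n",
		       drvinfo.driver, drvinfo.version, drvinfo.bus_info);
	return 0;
}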
#include <linux/kernel.h> #include <linux/module.h> #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/smp.h> #include <linux/blk-mq.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" static void blk_mq_sysfs_release(struct kobject *kobj) { } static void blk_mq_hw_sysfs_release(struct kobject *kobj) { struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); if (hctx->flags & BLK_MQ_F_BLOCKING) cleanup_srcu_struct(hctx->srcu); blk_free_flush_queue(hctx->fq); sbitmap_free(&hctx->ctx_map); free_cpumask_var(hctx->cpumask); kfree(hctx->ctxs); kfree(hctx); } struct blk_mq_ctx_sysfs_entry { struct attribute attr; ssize_t (*show)(struct blk_mq_ctx *, char *); ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t); }; struct blk_mq_hw_ctx_sysfs_entry { struct attribute attr; ssize_t (*show)(struct blk_mq_hw_ctx *, char *); ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); }; static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, char *page) { struct blk_mq_ctx_sysfs_entry *entry; struct blk_mq_ctx *ctx; struct request_queue *q; ssize_t res; entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); ctx = container_of(kobj, struct blk_mq_ctx, kobj); q = ctx->queue; if (!entry->show) return -EIO; res = -ENOENT; mutex_lock(&q->sysfs_lock); if (!blk_queue_dying(q)) res = entry->show(ctx, page); mutex_unlock(&q->sysfs_lock); return res; } static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct blk_mq_ctx_sysfs_entry *entry; struct blk_mq_ctx *ctx; struct request_queue *q; ssize_t res; entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); ctx = container_of(kobj, struct blk_mq_ctx, kobj); q = ctx->queue; if (!entry->store) return -EIO; res = -ENOENT; mutex_lock(&q->sysfs_lock); if (!blk_queue_dying(q))
res = entry->store(ctx, page, length); mutex_unlock(&q->sysfs_lock); return res; } static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, struct attribute *attr, char *page) { struct blk_mq_hw_ctx_sysfs_entry *entry; struct blk_mq_hw_ctx *hctx; struct request_queue *q; ssize_t res; entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); q = hctx->queue; if (!entry->show) return -EIO; res = -ENOENT; mutex_lock(&q->sysfs_lock); if (!blk_queue_dying(q)) res = entry->show(hctx, page); mutex_unlock(&q->sysfs_lock); return res; } static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct blk_mq_hw_ctx_sysfs_entry *entry; struct blk_mq_hw_ctx *hctx; struct request_queue *q; ssize_t res; entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); q = hctx->queue; if (!entry->store) return -EIO; res = -ENOENT; mutex_lock(&q->sysfs_lock); if (!blk_queue_dying(q)) res = entry->store(hctx, page, length); mutex_unlock(&q->sysfs_lock); return res; } static ssize_t blk_mq_hw_sysfs_nr_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { return sprintf(page, "%u\n", hctx->tags->nr_tags); } static ssize_t blk_mq_hw_sysfs_nr_reserved_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { return sprintf(page, "%u\n", hctx->tags->nr_reserved_tags); } static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) { const size_t size = PAGE_SIZE - 1; unsigned int i, first = 1; int ret = 0, pos = 0; for_each_cpu(i, hctx->cpumask) { if (first) ret = snprintf(pos + page, size - pos, "%u", i); else ret = snprintf(pos + page, size - pos, ", %u", i); if (ret >= size - pos) break; first = 0; pos += ret; } ret = snprintf(pos + page, size + 1 - pos, "\n"); return pos + ret; } static struct attribute *default_ctx_attrs[] = { NULL, }; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { .attr = {.name = "nr_tags", .mode = 0444 }, .show = blk_mq_hw_sysfs_nr_tags_show, }; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = { .attr = {.name = "nr_reserved_tags", .mode = 0444 }, .show = blk_mq_hw_sysfs_nr_reserved_tags_show, }; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .attr = {.name = "cpu_list", .mode = 0444 }, .show = blk_mq_hw_sysfs_cpus_show, }; static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_nr_tags.attr, &blk_mq_hw_sysfs_nr_reserved_tags.attr, &blk_mq_hw_sysfs_cpus.attr, NULL, }; static const struct sysfs_ops blk_mq_sysfs_ops = { .show = blk_mq_sysfs_show, .store = blk_mq_sysfs_store, }; static const struct sysfs_ops blk_mq_hw_sysfs_ops = { .show = blk_mq_hw_sysfs_show, .store = blk_mq_hw_sysfs_store, }; static struct kobj_type blk_mq_ktype = { .sysfs_ops = &blk_mq_sysfs_ops, .release = blk_mq_sysfs_release, }; static struct kobj_type blk_mq_ctx_ktype = { .sysfs_ops = &blk_mq_sysfs_ops, .default_attrs = default_ctx_attrs, .release = blk_mq_sysfs_release, }; static struct kobj_type blk_mq_hw_ktype = { .sysfs_ops = &blk_mq_hw_sysfs_ops, .default_attrs = default_hw_ctx_attrs, .release = blk_mq_hw_sysfs_release, }; static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) { struct blk_mq_ctx *ctx; int i; if (!hctx->nr_ctx) return; hctx_for_each_ctx(hctx, ctx, i) kobject_del(&ctx->kobj); kobject_del(&hctx->kobj); } static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct 
blk_mq_ctx *ctx; int i, ret; if (!hctx->nr_ctx) return 0; ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num); if (ret) return ret; hctx_for_each_ctx(hctx, ctx, i) { ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); if (ret) break; } return ret; } void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; lockdep_assert_held(&q->sysfs_dir_lock); queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); kobject_del(&q->mq_kobj); kobject_put(&dev->kobj); q->mq_sysfs_init_done = false; } void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) { kobject_init(&hctx->kobj, &blk_mq_hw_ktype); } void blk_mq_sysfs_deinit(struct request_queue *q) { struct blk_mq_ctx *ctx; int cpu; for_each_possible_cpu(cpu) { ctx = per_cpu_ptr(q->queue_ctx, cpu); kobject_put(&ctx->kobj); } kobject_put(&q->mq_kobj); } void blk_mq_sysfs_init(struct request_queue *q) { struct blk_mq_ctx *ctx; int cpu; kobject_init(&q->mq_kobj, &blk_mq_ktype); for_each_possible_cpu(cpu) { ctx = per_cpu_ptr(q->queue_ctx, cpu); kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); } } int __blk_mq_register_dev(struct device *dev, struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int ret, i; WARN_ON_ONCE(!q->kobj.parent); lockdep_assert_held(&q->sysfs_dir_lock); ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); if (ret < 0) goto out; kobject_uevent(&q->mq_kobj, KOBJ_ADD); queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) goto unreg; } q->mq_sysfs_init_done = true; out: return ret; unreg: while (--i >= 0) blk_mq_unregister_hctx(q->queue_hw_ctx[i]); kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); kobject_del(&q->mq_kobj); kobject_put(&dev->kobj); return ret; } int blk_mq_register_dev(struct device *dev, struct request_queue *q) { int ret; mutex_lock(&q->sysfs_lock); ret = __blk_mq_register_dev(dev, q); mutex_unlock(&q->sysfs_lock); return ret; } EXPORT_SYMBOL_GPL(blk_mq_register_dev); void blk_mq_sysfs_unregister(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) goto unlock; queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); unlock: mutex_unlock(&q->sysfs_dir_lock); } int blk_mq_sysfs_register(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i, ret = 0; mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) goto unlock; queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) break; } unlock: mutex_unlock(&q->sysfs_dir_lock); return ret; }
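As a hedged illustration of the sysfs pattern used in this file (not code that exists above): a new read-only hardware-queue attribute would repeat the same three steps as nr_tags, nr_reserved_tags and cpu_list — a show handler, a blk_mq_hw_ctx_sysfs_entry, and an entry in default_hw_ctx_attrs[]. The "queue_num" attribute name here is hypothetical.

/*
 * Hypothetical sketch: exposing one more hctx field through the
 * blk_mq_hw_ctx_sysfs_entry machinery defined earlier in this file.
 */
static ssize_t blk_mq_hw_sysfs_queue_num_show(struct blk_mq_hw_ctx *hctx,
					      char *page)
{
	/* every show handler simply formats one hctx field into the page */
	return sprintf(page, "%u\n", hctx->queue_num);
}

static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queue_num = {
	.attr = {.name = "queue_num", .mode = 0444 },
	.show = blk_mq_hw_sysfs_queue_num_show,
};

/* the new entry would then be added to default_hw_ctx_attrs[] before NULL */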
/* * Copyright 2002-2004, Instant802 Networks, Inc. * Copyright 2008, Jouni Malinen <j@w1.fi> * Copyright (C) 2016-2017 Intel Deutschland GmbH * Copyright (C) 2020-2021 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation.
*/ #include <linux/netdevice.h> #include <linux/types.h> #include <linux/skbuff.h> #include <linux/compiler.h> #include <linux/ieee80211.h> #include <linux/gfp.h> #include <asm/unaligned.h> #include <net/mac80211.h> #include <crypto/aes.h> #include <crypto/algapi.h> #include "ieee80211_i.h" #include "michael.h" #include "tkip.h" #include "aes_ccm.h" #include "aes_cmac.h" #include "aes_gmac.h" #include "aes_gcm.h" #include "wpa.h" ieee80211_tx_result ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx) { u8 *data, *key, *mic; size_t data_len; unsigned int hdrlen; struct ieee80211_hdr *hdr; struct sk_buff *skb = tx->skb; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int tail; hdr = (struct ieee80211_hdr *)skb->data; if (!tx->key || tx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP || skb->len < 24 || !ieee80211_is_data_present(hdr->frame_control)) return TX_CONTINUE; hdrlen = ieee80211_hdrlen(hdr->frame_control); if (skb->len < hdrlen) return TX_DROP; data = skb->data + hdrlen; data_len = skb->len - hdrlen; if (unlikely(info->flags & IEEE80211_TX_INTFL_TKIP_MIC_FAILURE)) { /* Need to use software crypto for the test */ info->control.hw_key = NULL; } if (info->control.hw_key && (info->flags & IEEE80211_TX_CTL_DONTFRAG || ieee80211_hw_check(&tx->local->hw, SUPPORTS_TX_FRAG)) && !(tx->key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | IEEE80211_KEY_FLAG_PUT_MIC_SPACE))) { /* hwaccel - with no need for SW-generated MMIC or MIC space */ return TX_CONTINUE; } tail = MICHAEL_MIC_LEN; if (!info->control.hw_key) tail += IEEE80211_TKIP_ICV_LEN; if (WARN(skb_tailroom(skb) < tail || skb_headroom(skb) < IEEE80211_TKIP_IV_LEN, "mmic: not enough head/tail (%d/%d,%d/%d)\n", skb_headroom(skb), IEEE80211_TKIP_IV_LEN, skb_tailroom(skb), tail)) return TX_DROP; mic = skb_put(skb, MICHAEL_MIC_LEN); if (tx->key->conf.flags & IEEE80211_KEY_FLAG_PUT_MIC_SPACE) { /* Zeroed MIC can help with debug */ memset(mic, 0, MICHAEL_MIC_LEN); return TX_CONTINUE; } key = &tx->key->conf.key[NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY]; michael_mic(key, hdr, data, data_len, mic); if (unlikely(info->flags & IEEE80211_TX_INTFL_TKIP_MIC_FAILURE)) mic[0]++; return TX_CONTINUE; } ieee80211_rx_result ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx) { u8 *data, *key = NULL; size_t data_len; unsigned int hdrlen; u8 mic[MICHAEL_MIC_LEN]; struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; /* * it makes no sense to check for MIC errors on anything other * than data frames. */ if (!ieee80211_is_data_present(hdr->frame_control)) return RX_CONTINUE; /* * No way to verify the MIC if the hardware stripped it or * the IV with the key index. In this case we have solely rely * on the driver to set RX_FLAG_MMIC_ERROR in the event of a * MIC failure report. */ if (status->flag & (RX_FLAG_MMIC_STRIPPED | RX_FLAG_IV_STRIPPED)) { if (status->flag & RX_FLAG_MMIC_ERROR) goto mic_fail_no_key; if (!(status->flag & RX_FLAG_IV_STRIPPED) && rx->key && rx->key->conf.cipher == WLAN_CIPHER_SUITE_TKIP) goto update_iv; return RX_CONTINUE; } /* * Some hardware seems to generate Michael MIC failure reports; even * though, the frame was not encrypted with TKIP and therefore has no * MIC. Ignore the flag them to avoid triggering countermeasures. 
*/ if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP || !(status->flag & RX_FLAG_DECRYPTED)) return RX_CONTINUE; if (rx->sdata->vif.type == NL80211_IFTYPE_AP && rx->key->conf.keyidx) { /* * APs with pairwise keys should never receive Michael MIC * errors for non-zero keyidx because these are reserved for * group keys and only the AP is sending real multicast * frames in the BSS. */ return RX_DROP_UNUSABLE; } if (status->flag & RX_FLAG_MMIC_ERROR) goto mic_fail; hdrlen = ieee80211_hdrlen(hdr->frame_control); if (skb->len < hdrlen + MICHAEL_MIC_LEN) return RX_DROP_UNUSABLE; if (skb_linearize(rx->skb)) return RX_DROP_UNUSABLE; hdr = (void *)skb->data; data = skb->data + hdrlen; data_len = skb->len - hdrlen - MICHAEL_MIC_LEN; key = &rx->key->conf.key[NL80211_TKIP_DATA_OFFSET_RX_MIC_KEY]; michael_mic(key, hdr, data, data_len, mic); if (crypto_memneq(mic, data + data_len, MICHAEL_MIC_LEN)) goto mic_fail; /* remove Michael MIC from payload */ skb_trim(skb, skb->len - MICHAEL_MIC_LEN); update_iv: /* update IV in key information to be able to detect replays */ rx->key->u.tkip.rx[rx->security_idx].iv32 = rx->tkip.iv32; rx->key->u.tkip.rx[rx->security_idx].iv16 = rx->tkip.iv16; return RX_CONTINUE; mic_fail: rx->key->u.tkip.mic_failures++; mic_fail_no_key: /* * In some cases the key can be unset - e.g. a multicast packet, in * a driver that supports HW encryption. Send up the key idx only if * the key is set. */ cfg80211_michael_mic_failure(rx->sdata->dev, hdr->addr2, is_multicast_ether_addr(hdr->addr1) ? NL80211_KEYTYPE_GROUP : NL80211_KEYTYPE_PAIRWISE, rx->key ? rx->key->conf.keyidx : -1, NULL, GFP_ATOMIC); return RX_DROP_UNUSABLE; } static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_key *key = tx->key; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); unsigned int hdrlen; int len, tail; u64 pn; u8 *pos; if (info->control.hw_key && !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV) && !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) { /* hwaccel - with no need for software-generated IV */ return 0; } hdrlen = ieee80211_hdrlen(hdr->frame_control); len = skb->len - hdrlen; if (info->control.hw_key) tail = 0; else tail = IEEE80211_TKIP_ICV_LEN; if (WARN_ON(skb_tailroom(skb) < tail || skb_headroom(skb) < IEEE80211_TKIP_IV_LEN)) return -1; pos = skb_push(skb, IEEE80211_TKIP_IV_LEN); memmove(pos, pos + IEEE80211_TKIP_IV_LEN, hdrlen); pos += hdrlen; /* the HW only needs room for the IV, but not the actual IV */ if (info->control.hw_key && (info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) return 0; /* Increase IV for the frame */ pn = atomic64_inc_return(&key->conf.tx_pn); pos = ieee80211_tkip_add_iv(pos, &key->conf, pn); /* hwaccel - with software IV */ if (info->control.hw_key) return 0; /* Add room for ICV */ skb_put(skb, IEEE80211_TKIP_ICV_LEN); return ieee80211_tkip_encrypt_data(tx->local->wep_tx_tfm, key, skb, pos, len); } ieee80211_tx_result ieee80211_crypto_tkip_encrypt(struct ieee80211_tx_data *tx) { struct sk_buff *skb; ieee80211_tx_set_protected(tx); skb_queue_walk(&tx->skbs, skb) { if (tkip_encrypt_skb(tx, skb) < 0) return TX_DROP; } return TX_CONTINUE; } ieee80211_rx_result ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) rx->skb->data; int hdrlen, res, hwaccel = 0; struct ieee80211_key *key = rx->key; struct sk_buff *skb = rx->skb; struct 
ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); hdrlen = ieee80211_hdrlen(hdr->frame_control); if (!ieee80211_is_data(hdr->frame_control)) return RX_CONTINUE; if (!rx->sta || skb->len - hdrlen < 12) return RX_DROP_UNUSABLE; /* it may be possible to optimize this a bit more */ if (skb_linearize(rx->skb)) return RX_DROP_UNUSABLE; hdr = (void *)skb->data; /* * Let TKIP code verify IV, but skip decryption. * In the case where hardware checks the IV as well, * we don't even get here, see ieee80211_rx_h_decrypt() */ if (status->flag & RX_FLAG_DECRYPTED) hwaccel = 1; res = ieee80211_tkip_decrypt_data(rx->local->wep_rx_tfm, key, skb->data + hdrlen, skb->len - hdrlen, rx->sta->sta.addr, hdr->addr1, hwaccel, rx->security_idx, &rx->tkip.iv32, &rx->tkip.iv16); if (res != TKIP_DECRYPT_OK) return RX_DROP_UNUSABLE; /* Trim ICV */ if (!(status->flag & RX_FLAG_ICV_STRIPPED)) skb_trim(skb, skb->len - IEEE80211_TKIP_ICV_LEN); /* Remove IV */ memmove(skb->data + IEEE80211_TKIP_IV_LEN, skb->data, hdrlen); skb_pull(skb, IEEE80211_TKIP_IV_LEN); return RX_CONTINUE; } static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *b_0, u8 *aad) { __le16 mask_fc; int a4_included, mgmt; u8 qos_tid; u16 len_a; unsigned int hdrlen; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; /* * Mask FC: zero subtype b4 b5 b6 (if not mgmt) * Retry, PwrMgt, MoreData; set Protected */ mgmt = ieee80211_is_mgmt(hdr->frame_control); mask_fc = hdr->frame_control; mask_fc &= ~cpu_to_le16(IEEE80211_FCTL_RETRY | IEEE80211_FCTL_PM | IEEE80211_FCTL_MOREDATA); if (!mgmt) mask_fc &= ~cpu_to_le16(0x0070); mask_fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); hdrlen = ieee80211_hdrlen(hdr->frame_control); len_a = hdrlen - 2; a4_included = ieee80211_has_a4(hdr->frame_control); if (ieee80211_is_data_qos(hdr->frame_control)) qos_tid = ieee80211_get_tid(hdr); else qos_tid = 0; /* In CCM, the initial vectors (IV) used for CTR mode encryption and CBC * mode authentication are not allowed to collide, yet both are derived * from this vector b_0. We only set L := 1 here to indicate that the * data size can be represented in (L+1) bytes. The CCM layer will take * care of storing the data length in the top (L+1) bytes and setting * and clearing the other bits as is required to derive the two IVs. 
*/ b_0[0] = 0x1; /* Nonce: Nonce Flags | A2 | PN * Nonce Flags: Priority (b0..b3) | Management (b4) | Reserved (b5..b7) */ b_0[1] = qos_tid | (mgmt << 4); memcpy(&b_0[2], hdr->addr2, ETH_ALEN); memcpy(&b_0[8], pn, IEEE80211_CCMP_PN_LEN); /* AAD (extra authenticate-only data) / masked 802.11 header * FC | A1 | A2 | A3 | SC | [A4] | [QC] */ put_unaligned_be16(len_a, &aad[0]); put_unaligned(mask_fc, (__le16 *)&aad[2]); memcpy(&aad[4], &hdr->addr1, 3 * ETH_ALEN); /* Mask Seq#, leave Frag# */ aad[22] = *((u8 *) &hdr->seq_ctrl) & 0x0f; aad[23] = 0; if (a4_included) { memcpy(&aad[24], hdr->addr4, ETH_ALEN); aad[30] = qos_tid; aad[31] = 0; } else { memset(&aad[24], 0, ETH_ALEN + IEEE80211_QOS_CTL_LEN); aad[24] = qos_tid; } } static inline void ccmp_pn2hdr(u8 *hdr, u8 *pn, int key_id) { hdr[0] = pn[5]; hdr[1] = pn[4]; hdr[2] = 0; hdr[3] = 0x20 | (key_id << 6); hdr[4] = pn[3]; hdr[5] = pn[2]; hdr[6] = pn[1]; hdr[7] = pn[0]; } static inline void ccmp_hdr2pn(u8 *pn, u8 *hdr) { pn[0] = hdr[7]; pn[1] = hdr[6]; pn[2] = hdr[5]; pn[3] = hdr[4]; pn[4] = hdr[1]; pn[5] = hdr[0]; } static int ccmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb, unsigned int mic_len) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_key *key = tx->key; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int hdrlen, len, tail; u8 *pos; u8 pn[6]; u64 pn64; u8 aad[CCM_AAD_LEN]; u8 b_0[AES_BLOCK_SIZE]; if (info->control.hw_key && !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV) && !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) && !((info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV_MGMT) && ieee80211_is_mgmt(hdr->frame_control))) { /* * hwaccel has no need for preallocated room for CCMP * header or MIC fields */ return 0; } hdrlen = ieee80211_hdrlen(hdr->frame_control); len = skb->len - hdrlen; if (info->control.hw_key) tail = 0; else tail = mic_len; if (WARN_ON(skb_tailroom(skb) < tail || skb_headroom(skb) < IEEE80211_CCMP_HDR_LEN)) return -1; pos = skb_push(skb, IEEE80211_CCMP_HDR_LEN); memmove(pos, pos + IEEE80211_CCMP_HDR_LEN, hdrlen); /* the HW only needs room for the IV, but not the actual IV */ if (info->control.hw_key && (info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) return 0; hdr = (struct ieee80211_hdr *) pos; pos += hdrlen; pn64 = atomic64_inc_return(&key->conf.tx_pn); pn[5] = pn64; pn[4] = pn64 >> 8; pn[3] = pn64 >> 16; pn[2] = pn64 >> 24; pn[1] = pn64 >> 32; pn[0] = pn64 >> 40; ccmp_pn2hdr(pos, pn, key->conf.keyidx); /* hwaccel - with software CCMP header */ if (info->control.hw_key) return 0; pos += IEEE80211_CCMP_HDR_LEN; ccmp_special_blocks(skb, pn, b_0, aad); return ieee80211_aes_ccm_encrypt(key->u.ccmp.tfm, b_0, aad, pos, len, skb_put(skb, mic_len)); } ieee80211_tx_result ieee80211_crypto_ccmp_encrypt(struct ieee80211_tx_data *tx, unsigned int mic_len) { struct sk_buff *skb; ieee80211_tx_set_protected(tx); skb_queue_walk(&tx->skbs, skb) { if (ccmp_encrypt_skb(tx, skb, mic_len) < 0) return TX_DROP; } return TX_CONTINUE; } ieee80211_rx_result ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, unsigned int mic_len) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; int hdrlen; struct ieee80211_key *key = rx->key; struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); u8 pn[IEEE80211_CCMP_PN_LEN]; int data_len; int queue; hdrlen = ieee80211_hdrlen(hdr->frame_control); if (!ieee80211_is_data(hdr->frame_control) && 
!ieee80211_is_robust_mgmt_frame(skb)) return RX_CONTINUE; if (status->flag & RX_FLAG_DECRYPTED) { if (!pskb_may_pull(rx->skb, hdrlen + IEEE80211_CCMP_HDR_LEN)) return RX_DROP_UNUSABLE; if (status->flag & RX_FLAG_MIC_STRIPPED) mic_len = 0; } else { if (skb_linearize(rx->skb)) return RX_DROP_UNUSABLE; } /* reload hdr - skb might have been reallocated */ hdr = (void *)rx->skb->data; data_len = skb->len - hdrlen - IEEE80211_CCMP_HDR_LEN - mic_len; if (!rx->sta || data_len < 0) return RX_DROP_UNUSABLE; if (!(status->flag & RX_FLAG_PN_VALIDATED)) { int res; ccmp_hdr2pn(pn, skb->data + hdrlen); queue = rx->security_idx; res = memcmp(pn, key->u.ccmp.rx_pn[queue], IEEE80211_CCMP_PN_LEN); if (res < 0 || (!res && !(status->flag & RX_FLAG_ALLOW_SAME_PN))) { key->u.ccmp.replays++; return RX_DROP_UNUSABLE; } if (!(status->flag & RX_FLAG_DECRYPTED)) { u8 aad[2 * AES_BLOCK_SIZE]; u8 b_0[AES_BLOCK_SIZE]; /* hardware didn't decrypt/verify MIC */ ccmp_special_blocks(skb, pn, b_0, aad); if (ieee80211_aes_ccm_decrypt( key->u.ccmp.tfm, b_0, aad, skb->data + hdrlen + IEEE80211_CCMP_HDR_LEN, data_len, skb->data + skb->len - mic_len)) return RX_DROP_UNUSABLE; } memcpy(key->u.ccmp.rx_pn[queue], pn, IEEE80211_CCMP_PN_LEN); if (unlikely(ieee80211_is_frag(hdr))) memcpy(rx->ccm_gcm.pn, pn, IEEE80211_CCMP_PN_LEN); } /* Remove CCMP header and MIC */ if (pskb_trim(skb, skb->len - mic_len)) return RX_DROP_UNUSABLE; memmove(skb->data + IEEE80211_CCMP_HDR_LEN, skb->data, hdrlen); skb_pull(skb, IEEE80211_CCMP_HDR_LEN); return RX_CONTINUE; } static void gcmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *j_0, u8 *aad) { __le16 mask_fc; u8 qos_tid; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; memcpy(j_0, hdr->addr2, ETH_ALEN); memcpy(&j_0[ETH_ALEN], pn, IEEE80211_GCMP_PN_LEN); j_0[13] = 0; j_0[14] = 0; j_0[AES_BLOCK_SIZE - 1] = 0x01; /* AAD (extra authenticate-only data) / masked 802.11 header * FC | A1 | A2 | A3 | SC | [A4] | [QC] */ put_unaligned_be16(ieee80211_hdrlen(hdr->frame_control) - 2, &aad[0]); /* Mask FC: zero subtype b4 b5 b6 (if not mgmt) * Retry, PwrMgt, MoreData; set Protected */ mask_fc = hdr->frame_control; mask_fc &= ~cpu_to_le16(IEEE80211_FCTL_RETRY | IEEE80211_FCTL_PM | IEEE80211_FCTL_MOREDATA); if (!ieee80211_is_mgmt(hdr->frame_control)) mask_fc &= ~cpu_to_le16(0x0070); mask_fc |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); put_unaligned(mask_fc, (__le16 *)&aad[2]); memcpy(&aad[4], &hdr->addr1, 3 * ETH_ALEN); /* Mask Seq#, leave Frag# */ aad[22] = *((u8 *)&hdr->seq_ctrl) & 0x0f; aad[23] = 0; if (ieee80211_is_data_qos(hdr->frame_control)) qos_tid = ieee80211_get_tid(hdr); else qos_tid = 0; if (ieee80211_has_a4(hdr->frame_control)) { memcpy(&aad[24], hdr->addr4, ETH_ALEN); aad[30] = qos_tid; aad[31] = 0; } else { memset(&aad[24], 0, ETH_ALEN + IEEE80211_QOS_CTL_LEN); aad[24] = qos_tid; } } static inline void gcmp_pn2hdr(u8 *hdr, const u8 *pn, int key_id) { hdr[0] = pn[5]; hdr[1] = pn[4]; hdr[2] = 0; hdr[3] = 0x20 | (key_id << 6); hdr[4] = pn[3]; hdr[5] = pn[2]; hdr[6] = pn[1]; hdr[7] = pn[0]; } static inline void gcmp_hdr2pn(u8 *pn, const u8 *hdr) { pn[0] = hdr[7]; pn[1] = hdr[6]; pn[2] = hdr[5]; pn[3] = hdr[4]; pn[4] = hdr[1]; pn[5] = hdr[0]; } static int gcmp_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; struct ieee80211_key *key = tx->key; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int hdrlen, len, tail; u8 *pos; u8 pn[6]; u64 pn64; u8 aad[GCM_AAD_LEN]; u8 j_0[AES_BLOCK_SIZE]; if 
(info->control.hw_key && !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV) && !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE) && !((info->control.hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV_MGMT) && ieee80211_is_mgmt(hdr->frame_control))) { /* hwaccel has no need for preallocated room for GCMP * header or MIC fields */ return 0; } hdrlen = ieee80211_hdrlen(hdr->frame_control); len = skb->len - hdrlen; if (info->control.hw_key) tail = 0; else tail = IEEE80211_GCMP_MIC_LEN; if (WARN_ON(skb_tailroom(skb) < tail || skb_headroom(skb) < IEEE80211_GCMP_HDR_LEN)) return -1; pos = skb_push(skb, IEEE80211_GCMP_HDR_LEN); memmove(pos, pos + IEEE80211_GCMP_HDR_LEN, hdrlen); skb_set_network_header(skb, skb_network_offset(skb) + IEEE80211_GCMP_HDR_LEN); /* the HW only needs room for the IV, but not the actual IV */ if (info->control.hw_key && (info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) return 0; hdr = (struct ieee80211_hdr *)pos; pos += hdrlen; pn64 = atomic64_inc_return(&key->conf.tx_pn); pn[5] = pn64; pn[4] = pn64 >> 8; pn[3] = pn64 >> 16; pn[2] = pn64 >> 24; pn[1] = pn64 >> 32; pn[0] = pn64 >> 40; gcmp_pn2hdr(pos, pn, key->conf.keyidx); /* hwaccel - with software GCMP header */ if (info->control.hw_key) return 0; pos += IEEE80211_GCMP_HDR_LEN; gcmp_special_blocks(skb, pn, j_0, aad); return ieee80211_aes_gcm_encrypt(key->u.gcmp.tfm, j_0, aad, pos, len, skb_put(skb, IEEE80211_GCMP_MIC_LEN)); } ieee80211_tx_result ieee80211_crypto_gcmp_encrypt(struct ieee80211_tx_data *tx) { struct sk_buff *skb; ieee80211_tx_set_protected(tx); skb_queue_walk(&tx->skbs, skb) { if (gcmp_encrypt_skb(tx, skb) < 0) return TX_DROP; } return TX_CONTINUE; } ieee80211_rx_result ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; int hdrlen; struct ieee80211_key *key = rx->key; struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); u8 pn[IEEE80211_GCMP_PN_LEN]; int data_len, queue, mic_len = IEEE80211_GCMP_MIC_LEN; hdrlen = ieee80211_hdrlen(hdr->frame_control); if (!ieee80211_is_data(hdr->frame_control) && !ieee80211_is_robust_mgmt_frame(skb)) return RX_CONTINUE; if (status->flag & RX_FLAG_DECRYPTED) { if (!pskb_may_pull(rx->skb, hdrlen + IEEE80211_GCMP_HDR_LEN)) return RX_DROP_UNUSABLE; if (status->flag & RX_FLAG_MIC_STRIPPED) mic_len = 0; } else { if (skb_linearize(rx->skb)) return RX_DROP_UNUSABLE; } /* reload hdr - skb might have been reallocated */ hdr = (void *)rx->skb->data; data_len = skb->len - hdrlen - IEEE80211_GCMP_HDR_LEN - mic_len; if (!rx->sta || data_len < 0) return RX_DROP_UNUSABLE; if (!(status->flag & RX_FLAG_PN_VALIDATED)) { int res; gcmp_hdr2pn(pn, skb->data + hdrlen); queue = rx->security_idx; res = memcmp(pn, key->u.gcmp.rx_pn[queue], IEEE80211_GCMP_PN_LEN); if (res < 0 || (!res && !(status->flag & RX_FLAG_ALLOW_SAME_PN))) { key->u.gcmp.replays++; return RX_DROP_UNUSABLE; } if (!(status->flag & RX_FLAG_DECRYPTED)) { u8 aad[2 * AES_BLOCK_SIZE]; u8 j_0[AES_BLOCK_SIZE]; /* hardware didn't decrypt/verify MIC */ gcmp_special_blocks(skb, pn, j_0, aad); if (ieee80211_aes_gcm_decrypt( key->u.gcmp.tfm, j_0, aad, skb->data + hdrlen + IEEE80211_GCMP_HDR_LEN, data_len, skb->data + skb->len - IEEE80211_GCMP_MIC_LEN)) return RX_DROP_UNUSABLE; } memcpy(key->u.gcmp.rx_pn[queue], pn, IEEE80211_GCMP_PN_LEN); if (unlikely(ieee80211_is_frag(hdr))) memcpy(rx->ccm_gcm.pn, pn, IEEE80211_CCMP_PN_LEN); } /* Remove GCMP header and MIC */ if (pskb_trim(skb, skb->len - 
mic_len)) return RX_DROP_UNUSABLE; memmove(skb->data + IEEE80211_GCMP_HDR_LEN, skb->data, hdrlen); skb_pull(skb, IEEE80211_GCMP_HDR_LEN); return RX_CONTINUE; } static ieee80211_tx_result ieee80211_crypto_cs_encrypt(struct ieee80211_tx_data *tx, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; struct ieee80211_key *key = tx->key; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int hdrlen; u8 *pos, iv_len = key->conf.iv_len; if (info->control.hw_key && !(info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) { /* hwaccel has no need for preallocated head room */ return TX_CONTINUE; } if (unlikely(skb_headroom(skb) < iv_len && pskb_expand_head(skb, iv_len, 0, GFP_ATOMIC))) return TX_DROP; hdrlen = ieee80211_hdrlen(hdr->frame_control); pos = skb_push(skb, iv_len); memmove(pos, pos + iv_len, hdrlen); return TX_CONTINUE; } static inline int ieee80211_crypto_cs_pn_compare(u8 *pn1, u8 *pn2, int len) { int i; /* pn is little endian */ for (i = len - 1; i >= 0; i--) { if (pn1[i] < pn2[i]) return -1; else if (pn1[i] > pn2[i]) return 1; } return 0; } static ieee80211_rx_result ieee80211_crypto_cs_decrypt(struct ieee80211_rx_data *rx) { struct ieee80211_key *key = rx->key; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; const struct ieee80211_cipher_scheme *cs = NULL; int hdrlen = ieee80211_hdrlen(hdr->frame_control); struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); int data_len; u8 *rx_pn; u8 *skb_pn; u8 qos_tid; if (!rx->sta || !rx->sta->cipher_scheme || !(status->flag & RX_FLAG_DECRYPTED)) return RX_DROP_UNUSABLE; if (!ieee80211_is_data(hdr->frame_control)) return RX_CONTINUE; cs = rx->sta->cipher_scheme; data_len = rx->skb->len - hdrlen - cs->hdr_len; if (data_len < 0) return RX_DROP_UNUSABLE; if (ieee80211_is_data_qos(hdr->frame_control)) qos_tid = ieee80211_get_tid(hdr); else qos_tid = 0; if (skb_linearize(rx->skb)) return RX_DROP_UNUSABLE; hdr = (struct ieee80211_hdr *)rx->skb->data; rx_pn = key->u.gen.rx_pn[qos_tid]; skb_pn = rx->skb->data + hdrlen + cs->pn_off; if (ieee80211_crypto_cs_pn_compare(skb_pn, rx_pn, cs->pn_len) <= 0) return RX_DROP_UNUSABLE; memcpy(rx_pn, skb_pn, cs->pn_len); /* remove security header and MIC */ if (pskb_trim(rx->skb, rx->skb->len - cs->mic_len)) return RX_DROP_UNUSABLE; memmove(rx->skb->data + cs->hdr_len, rx->skb->data, hdrlen); skb_pull(rx->skb, cs->hdr_len); return RX_CONTINUE; } static void bip_aad(struct sk_buff *skb, u8 *aad) { __le16 mask_fc; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; /* BIP AAD: FC(masked) || A1 || A2 || A3 */ /* FC type/subtype */ /* Mask FC Retry, PwrMgt, MoreData flags to zero */ mask_fc = hdr->frame_control; mask_fc &= ~cpu_to_le16(IEEE80211_FCTL_RETRY | IEEE80211_FCTL_PM | IEEE80211_FCTL_MOREDATA); put_unaligned(mask_fc, (__le16 *) &aad[0]); /* A1 || A2 || A3 */ memcpy(aad + 2, &hdr->addr1, 3 * ETH_ALEN); } static inline void bip_ipn_set64(u8 *d, u64 pn) { *d++ = pn; *d++ = pn >> 8; *d++ = pn >> 16; *d++ = pn >> 24; *d++ = pn >> 32; *d = pn >> 40; } static inline void bip_ipn_swap(u8 *d, const u8 *s) { *d++ = s[5]; *d++ = s[4]; *d++ = s[3]; *d++ = s[2]; *d++ = s[1]; *d = s[0]; } ieee80211_tx_result ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx) { struct sk_buff *skb; struct ieee80211_tx_info *info; struct ieee80211_key *key = tx->key; struct ieee80211_mmie *mmie; u8 aad[20]; u64 pn64; if (WARN_ON(skb_queue_len(&tx->skbs) != 1)) return TX_DROP; skb = skb_peek(&tx->skbs); info = IEEE80211_SKB_CB(skb); if 
(info->control.hw_key) return TX_CONTINUE; if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) return TX_DROP; mmie = skb_put(skb, sizeof(*mmie)); mmie->element_id = WLAN_EID_MMIE; mmie->length = sizeof(*mmie) - 2; mmie->key_id = cpu_to_le16(key->conf.keyidx); /* PN = PN + 1 */ pn64 = atomic64_inc_return(&key->conf.tx_pn); bip_ipn_set64(mmie->sequence_number, pn64); bip_aad(skb, aad); /* * MIC = AES-128-CMAC(IGTK, AAD || Management Frame Body || MMIE, 64) */ ieee80211_aes_cmac(key->u.aes_cmac.tfm, aad, skb->data + 24, skb->len - 24, mmie->mic); return TX_CONTINUE; } ieee80211_tx_result ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx) { struct sk_buff *skb; struct ieee80211_tx_info *info; struct ieee80211_key *key = tx->key; struct ieee80211_mmie_16 *mmie; u8 aad[20]; u64 pn64; if (WARN_ON(skb_queue_len(&tx->skbs) != 1)) return TX_DROP; skb = skb_peek(&tx->skbs); info = IEEE80211_SKB_CB(skb); if (info->control.hw_key) return TX_CONTINUE; if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) return TX_DROP; mmie = skb_put(skb, sizeof(*mmie)); mmie->element_id = WLAN_EID_MMIE; mmie->length = sizeof(*mmie) - 2; mmie->key_id = cpu_to_le16(key->conf.keyidx); /* PN = PN + 1 */ pn64 = atomic64_inc_return(&key->conf.tx_pn); bip_ipn_set64(mmie->sequence_number, pn64); bip_aad(skb, aad); /* MIC = AES-256-CMAC(IGTK, AAD || Management Frame Body || MMIE, 128) */ ieee80211_aes_cmac_256(key->u.aes_cmac.tfm, aad, skb->data + 24, skb->len - 24, mmie->mic); return TX_CONTINUE; } ieee80211_rx_result ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx) { struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_key *key = rx->key; struct ieee80211_mmie *mmie; u8 aad[20], mic[8], ipn[6]; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; if (!ieee80211_is_mgmt(hdr->frame_control)) return RX_CONTINUE; /* management frames are already linear */ if (skb->len < 24 + sizeof(*mmie)) return RX_DROP_UNUSABLE; mmie = (struct ieee80211_mmie *) (skb->data + skb->len - sizeof(*mmie)); if (mmie->element_id != WLAN_EID_MMIE || mmie->length != sizeof(*mmie) - 2) return RX_DROP_UNUSABLE; /* Invalid MMIE */ bip_ipn_swap(ipn, mmie->sequence_number); if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) { key->u.aes_cmac.replays++; return RX_DROP_UNUSABLE; } if (!(status->flag & RX_FLAG_DECRYPTED)) { /* hardware didn't decrypt/verify MIC */ bip_aad(skb, aad); ieee80211_aes_cmac(key->u.aes_cmac.tfm, aad, skb->data + 24, skb->len - 24, mic); if (crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) { key->u.aes_cmac.icverrors++; return RX_DROP_UNUSABLE; } } memcpy(key->u.aes_cmac.rx_pn, ipn, 6); /* Remove MMIE */ skb_trim(skb, skb->len - sizeof(*mmie)); return RX_CONTINUE; } ieee80211_rx_result ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx) { struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_key *key = rx->key; struct ieee80211_mmie_16 *mmie; u8 aad[20], mic[16], ipn[6]; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; if (!ieee80211_is_mgmt(hdr->frame_control)) return RX_CONTINUE; /* management frames are already linear */ if (skb->len < 24 + sizeof(*mmie)) return RX_DROP_UNUSABLE; mmie = (struct ieee80211_mmie_16 *) (skb->data + skb->len - sizeof(*mmie)); if (mmie->element_id != WLAN_EID_MMIE || mmie->length != sizeof(*mmie) - 2) return RX_DROP_UNUSABLE; /* Invalid MMIE */ bip_ipn_swap(ipn, mmie->sequence_number); if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) 
{ key->u.aes_cmac.replays++; return RX_DROP_UNUSABLE; } if (!(status->flag & RX_FLAG_DECRYPTED)) { /* hardware didn't decrypt/verify MIC */ bip_aad(skb, aad); ieee80211_aes_cmac_256(key->u.aes_cmac.tfm, aad, skb->data + 24, skb->len - 24, mic); if (crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) { key->u.aes_cmac.icverrors++; return RX_DROP_UNUSABLE; } } memcpy(key->u.aes_cmac.rx_pn, ipn, 6); /* Remove MMIE */ skb_trim(skb, skb->len - sizeof(*mmie)); return RX_CONTINUE; } ieee80211_tx_result ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx) { struct sk_buff *skb; struct ieee80211_tx_info *info; struct ieee80211_key *key = tx->key; struct ieee80211_mmie_16 *mmie; struct ieee80211_hdr *hdr; u8 aad[GMAC_AAD_LEN]; u64 pn64; u8 nonce[GMAC_NONCE_LEN]; if (WARN_ON(skb_queue_len(&tx->skbs) != 1)) return TX_DROP; skb = skb_peek(&tx->skbs); info = IEEE80211_SKB_CB(skb); if (info->control.hw_key) return TX_CONTINUE; if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) return TX_DROP; mmie = skb_put(skb, sizeof(*mmie)); mmie->element_id = WLAN_EID_MMIE; mmie->length = sizeof(*mmie) - 2; mmie->key_id = cpu_to_le16(key->conf.keyidx); /* PN = PN + 1 */ pn64 = atomic64_inc_return(&key->conf.tx_pn); bip_ipn_set64(mmie->sequence_number, pn64); bip_aad(skb, aad); hdr = (struct ieee80211_hdr *)skb->data; memcpy(nonce, hdr->addr2, ETH_ALEN); bip_ipn_swap(nonce + ETH_ALEN, mmie->sequence_number); /* MIC = AES-GMAC(IGTK, AAD || Management Frame Body || MMIE, 128) */ if (ieee80211_aes_gmac(key->u.aes_gmac.tfm, aad, nonce, skb->data + 24, skb->len - 24, mmie->mic) < 0) return TX_DROP; return TX_CONTINUE; } ieee80211_rx_result ieee80211_crypto_aes_gmac_decrypt(struct ieee80211_rx_data *rx) { struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_key *key = rx->key; struct ieee80211_mmie_16 *mmie; u8 aad[GMAC_AAD_LEN], *mic, ipn[6], nonce[GMAC_NONCE_LEN]; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; if (!ieee80211_is_mgmt(hdr->frame_control)) return RX_CONTINUE; /* management frames are already linear */ if (skb->len < 24 + sizeof(*mmie)) return RX_DROP_UNUSABLE; mmie = (struct ieee80211_mmie_16 *) (skb->data + skb->len - sizeof(*mmie)); if (mmie->element_id != WLAN_EID_MMIE || mmie->length != sizeof(*mmie) - 2) return RX_DROP_UNUSABLE; /* Invalid MMIE */ bip_ipn_swap(ipn, mmie->sequence_number); if (memcmp(ipn, key->u.aes_gmac.rx_pn, 6) <= 0) { key->u.aes_gmac.replays++; return RX_DROP_UNUSABLE; } if (!(status->flag & RX_FLAG_DECRYPTED)) { /* hardware didn't decrypt/verify MIC */ bip_aad(skb, aad); memcpy(nonce, hdr->addr2, ETH_ALEN); memcpy(nonce + ETH_ALEN, ipn, 6); mic = kmalloc(GMAC_MIC_LEN, GFP_ATOMIC); if (!mic) return RX_DROP_UNUSABLE; if (ieee80211_aes_gmac(key->u.aes_gmac.tfm, aad, nonce, skb->data + 24, skb->len - 24, mic) < 0 || crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) { key->u.aes_gmac.icverrors++; kfree(mic); return RX_DROP_UNUSABLE; } kfree(mic); } memcpy(key->u.aes_gmac.rx_pn, ipn, 6); /* Remove MMIE */ skb_trim(skb, skb->len - sizeof(*mmie)); return RX_CONTINUE; } ieee80211_tx_result ieee80211_crypto_hw_encrypt(struct ieee80211_tx_data *tx) { struct sk_buff *skb; struct ieee80211_tx_info *info = NULL; ieee80211_tx_result res; skb_queue_walk(&tx->skbs, skb) { info = IEEE80211_SKB_CB(skb); /* handle hw-only algorithm */ if (!info->control.hw_key) return TX_DROP; if (tx->key->flags & KEY_FLAG_CIPHER_SCHEME) { res = ieee80211_crypto_cs_encrypt(tx, skb); if (res != TX_CONTINUE) return res; } } 
ieee80211_tx_set_protected(tx); return TX_CONTINUE; } ieee80211_rx_result ieee80211_crypto_hw_decrypt(struct ieee80211_rx_data *rx) { if (rx->sta && rx->sta->cipher_scheme) return ieee80211_crypto_cs_decrypt(rx); return RX_DROP_UNUSABLE; }
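/*
 * Editor's note: the following is a minimal userspace sketch (not kernel
 * code) illustrating the CCMP/GCMP packet-number handling used above. The
 * local helpers pn2hdr()/hdr2pn() simply re-state the byte layout written by
 * ccmp_pn2hdr() and read back by ccmp_hdr2pn(), and the replay check mirrors
 * the big-endian memcmp() in ieee80211_crypto_ccmp_decrypt(). The PN values
 * and key ID below are made-up example data.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define CCMP_PN_LEN 6

/* PN is kept big-endian (pn[0] = most significant byte), as in mac80211 */
static void pn2hdr(uint8_t *hdr, const uint8_t *pn, int key_id)
{
	hdr[0] = pn[5];
	hdr[1] = pn[4];
	hdr[2] = 0;			/* reserved */
	hdr[3] = 0x20 | (key_id << 6);	/* ExtIV bit | key ID */
	hdr[4] = pn[3];
	hdr[5] = pn[2];
	hdr[6] = pn[1];
	hdr[7] = pn[0];
}

static void hdr2pn(uint8_t *pn, const uint8_t *hdr)
{
	pn[0] = hdr[7];
	pn[1] = hdr[6];
	pn[2] = hdr[5];
	pn[3] = hdr[4];
	pn[4] = hdr[1];
	pn[5] = hdr[0];
}

int main(void)
{
	uint8_t last_pn[CCMP_PN_LEN] = { 0, 0, 0, 0, 0, 9 };	/* last accepted PN */
	uint8_t tx_pn[CCMP_PN_LEN]   = { 0, 0, 0, 0, 0, 10 };	/* PN of a new frame */
	uint8_t hdr[8], rx_pn[CCMP_PN_LEN];

	pn2hdr(hdr, tx_pn, 1);		/* build the 8-byte CCMP header */
	hdr2pn(rx_pn, hdr);		/* recover the PN on the receive side */

	/* equal-length big-endian byte strings compare like the numbers they encode */
	if (memcmp(rx_pn, last_pn, CCMP_PN_LEN) <= 0)
		printf("replay detected\n");
	else
		printf("PN ok, accepting frame (key id %d)\n", (hdr[3] >> 6) & 3);
	return 0;
}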
/* * fs/inotify_user.c - inotify support for userspace * * Authors: * John McCutchan <ttb@tentacle.dhs.org> * Robert
Love <rml@novell.com> * * Copyright (C) 2005 John McCutchan * Copyright 2006 Hewlett-Packard Development Company, L.P. * * Copyright (C) 2009 Eric Paris <Red Hat Inc> * inotify was largely rewriten to make use of the fsnotify infrastructure * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2, or (at your option) any * later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ #include <linux/file.h> #include <linux/fs.h> /* struct inode */ #include <linux/fsnotify_backend.h> #include <linux/idr.h> #include <linux/init.h> /* fs_initcall */ #include <linux/inotify.h> #include <linux/kernel.h> /* roundup() */ #include <linux/namei.h> /* LOOKUP_FOLLOW */ #include <linux/sched/signal.h> #include <linux/slab.h> /* struct kmem_cache */ #include <linux/syscalls.h> #include <linux/types.h> #include <linux/anon_inodes.h> #include <linux/uaccess.h> #include <linux/poll.h> #include <linux/wait.h> #include <linux/memcontrol.h> #include "inotify.h" #include "../fdinfo.h" #include <asm/ioctls.h> /* configurable via /proc/sys/fs/inotify/ */ static int inotify_max_queued_events __read_mostly; struct kmem_cache *inotify_inode_mark_cachep __read_mostly; #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> static int zero; struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, { .procname = "max_user_watches", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES], .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, { .procname = "max_queued_events", .data = &inotify_max_queued_events, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero }, { } }; #endif /* CONFIG_SYSCTL */ static inline __u32 inotify_arg_to_mask(u32 arg) { __u32 mask; /* * everything should accept their own ignored, cares about children, * and should receive events when the inode is unmounted */ mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD | FS_UNMOUNT); /* mask off the flags used to open the fd */ mask |= (arg & INOTIFY_USER_MASK); return mask; } static inline u32 inotify_mask_to_arg(__u32 mask) { return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED | IN_Q_OVERFLOW); } /* intofiy userspace file descriptor functions */ static __poll_t inotify_poll(struct file *file, poll_table *wait) { struct fsnotify_group *group = file->private_data; __poll_t ret = 0; poll_wait(file, &group->notification_waitq, wait); spin_lock(&group->notification_lock); if (!fsnotify_notify_queue_is_empty(group)) ret = EPOLLIN | EPOLLRDNORM; spin_unlock(&group->notification_lock); return ret; } static int round_event_name_len(struct fsnotify_event *fsn_event) { struct inotify_event_info *event; event = INOTIFY_E(fsn_event); if (!event->name_len) return 0; return roundup(event->name_len + 1, sizeof(struct inotify_event)); } /* * Get an inotify_kernel_event if one exists and is small * enough to fit in "count". Return an error pointer if * not large enough. * * Called with the group->notification_lock held. 
*/ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, size_t count) { size_t event_size = sizeof(struct inotify_event); struct fsnotify_event *event; if (fsnotify_notify_queue_is_empty(group)) return NULL; event = fsnotify_peek_first_event(group); pr_debug("%s: group=%p event=%p\n", __func__, group, event); event_size += round_event_name_len(event); if (event_size > count) return ERR_PTR(-EINVAL); /* held the notification_lock the whole time, so this is the * same event we peeked above */ fsnotify_remove_first_event(group); return event; } /* * Copy an event to user space, returning how much we copied. * * We already checked that the event size is smaller than the * buffer we had in "get_one_event()" above. */ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fsnotify_event *fsn_event, char __user *buf) { struct inotify_event inotify_event; struct inotify_event_info *event; size_t event_size = sizeof(struct inotify_event); size_t name_len; size_t pad_name_len; pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event); event = INOTIFY_E(fsn_event); name_len = event->name_len; /* * round up name length so it is a multiple of event_size * plus an extra byte for the terminating '\0'. */ pad_name_len = round_event_name_len(fsn_event); inotify_event.len = pad_name_len; inotify_event.mask = inotify_mask_to_arg(fsn_event->mask); inotify_event.wd = event->wd; inotify_event.cookie = event->sync_cookie; /* send the main event */ if (copy_to_user(buf, &inotify_event, event_size)) return -EFAULT; buf += event_size; /* * fsnotify only stores the pathname, so here we have to send the pathname * and then pad that pathname out to a multiple of sizeof(inotify_event) * with zeros. */ if (pad_name_len) { /* copy the path name */ if (copy_to_user(buf, event->name, name_len)) return -EFAULT; buf += name_len; /* fill userspace with 0's */ if (clear_user(buf, pad_name_len - name_len)) return -EFAULT; event_size += pad_name_len; } return event_size; } static ssize_t inotify_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { struct fsnotify_group *group; struct fsnotify_event *kevent; char __user *start; int ret; DEFINE_WAIT_FUNC(wait, woken_wake_function); start = buf; group = file->private_data; add_wait_queue(&group->notification_waitq, &wait); while (1) { spin_lock(&group->notification_lock); kevent = get_one_event(group, count); spin_unlock(&group->notification_lock); pr_debug("%s: group=%p kevent=%p\n", __func__, group, kevent); if (kevent) { ret = PTR_ERR(kevent); if (IS_ERR(kevent)) break; ret = copy_event_to_user(group, kevent, buf); fsnotify_destroy_event(group, kevent); if (ret < 0) break; buf += ret; count -= ret; continue; } ret = -EAGAIN; if (file->f_flags & O_NONBLOCK) break; ret = -ERESTARTSYS; if (signal_pending(current)) break; if (start != buf) break; wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&group->notification_waitq, &wait); if (start != buf && ret != -EFAULT) ret = buf - start; return ret; } static int inotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; pr_debug("%s: group=%p\n", __func__, group); /* free this group, matching get was inotify_init->fsnotify_obtain_group */ fsnotify_destroy_group(group); return 0; } static long inotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fsnotify_group *group; struct fsnotify_event *fsn_event; void __user *p; int ret = -ENOTTY; size_t send_len = 0; group = 
file->private_data; p = (void __user *) arg; pr_debug("%s: group=%p cmd=%u\n", __func__, group, cmd); switch (cmd) { case FIONREAD: spin_lock(&group->notification_lock); list_for_each_entry(fsn_event, &group->notification_list, list) { send_len += sizeof(struct inotify_event); send_len += round_event_name_len(fsn_event); } spin_unlock(&group->notification_lock); ret = put_user(send_len, (int __user *) p); break; #ifdef CONFIG_CHECKPOINT_RESTORE case INOTIFY_IOC_SETNEXTWD: ret = -EINVAL; if (arg >= 1 && arg <= INT_MAX) { struct inotify_group_private_data *data; data = &group->inotify_data; spin_lock(&data->idr_lock); idr_set_cursor(&data->idr, (unsigned int)arg); spin_unlock(&data->idr_lock); ret = 0; } break; #endif /* CONFIG_CHECKPOINT_RESTORE */ } return ret; } static const struct file_operations inotify_fops = { .show_fdinfo = inotify_show_fdinfo, .poll = inotify_poll, .read = inotify_read, .fasync = fsnotify_fasync, .release = inotify_release, .unlocked_ioctl = inotify_ioctl, .compat_ioctl = inotify_ioctl, .llseek = noop_llseek, }; /* * find_inode - resolve a user-given path to a specific inode */ static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags) { int error; error = user_path_at(AT_FDCWD, dirname, flags, path); if (error) return error; /* you can only watch an inode if you have read permissions on it */ error = inode_permission(path->dentry->d_inode, MAY_READ); if (error) path_put(path); return error; } static int inotify_add_to_idr(struct idr *idr, spinlock_t *idr_lock, struct inotify_inode_mark *i_mark) { int ret; idr_preload(GFP_KERNEL); spin_lock(idr_lock); ret = idr_alloc_cyclic(idr, i_mark, 1, 0, GFP_NOWAIT); if (ret >= 0) { /* we added the mark to the idr, take a reference */ i_mark->wd = ret; fsnotify_get_mark(&i_mark->fsn_mark); } spin_unlock(idr_lock); idr_preload_end(); return ret < 0 ? ret : 0; } static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group *group, int wd) { struct idr *idr = &group->inotify_data.idr; spinlock_t *idr_lock = &group->inotify_data.idr_lock; struct inotify_inode_mark *i_mark; assert_spin_locked(idr_lock); i_mark = idr_find(idr, wd); if (i_mark) { struct fsnotify_mark *fsn_mark = &i_mark->fsn_mark; fsnotify_get_mark(fsn_mark); /* One ref for being in the idr, one ref we just took */ BUG_ON(refcount_read(&fsn_mark->refcnt) < 2); } return i_mark; } static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group, int wd) { struct inotify_inode_mark *i_mark; spinlock_t *idr_lock = &group->inotify_data.idr_lock; spin_lock(idr_lock); i_mark = inotify_idr_find_locked(group, wd); spin_unlock(idr_lock); return i_mark; } /* * Remove the mark from the idr (if present) and drop the reference * on the mark because it was in the idr. */ static void inotify_remove_from_idr(struct fsnotify_group *group, struct inotify_inode_mark *i_mark) { struct idr *idr = &group->inotify_data.idr; spinlock_t *idr_lock = &group->inotify_data.idr_lock; struct inotify_inode_mark *found_i_mark = NULL; int wd; spin_lock(idr_lock); wd = i_mark->wd; /* * does this i_mark think it is in the idr? we shouldn't get called * if it wasn't.... 
*/ if (wd == -1) { WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); goto out; } /* Lets look in the idr to see if we find it */ found_i_mark = inotify_idr_find_locked(group, wd); if (unlikely(!found_i_mark)) { WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); goto out; } /* * We found an mark in the idr at the right wd, but it's * not the mark we were told to remove. eparis seriously * fucked up somewhere. */ if (unlikely(found_i_mark != i_mark)) { WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p " "found_i_mark=%p found_i_mark->wd=%d " "found_i_mark->group=%p\n", __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group, found_i_mark, found_i_mark->wd, found_i_mark->fsn_mark.group); goto out; } /* * One ref for being in the idr * one ref grabbed by inotify_idr_find */ if (unlikely(refcount_read(&i_mark->fsn_mark.refcnt) < 2)) { printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); /* we can't really recover with bad ref cnting.. */ BUG(); } idr_remove(idr, wd); /* Removed from the idr, drop that ref. */ fsnotify_put_mark(&i_mark->fsn_mark); out: i_mark->wd = -1; spin_unlock(idr_lock); /* match the ref taken by inotify_idr_find_locked() */ if (found_i_mark) fsnotify_put_mark(&found_i_mark->fsn_mark); } /* * Send IN_IGNORED for this wd, remove this wd from the idr. */ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) { struct inotify_inode_mark *i_mark; struct fsnotify_iter_info iter_info = { }; fsnotify_iter_set_report_type_mark(&iter_info, FSNOTIFY_OBJ_TYPE_INODE, fsn_mark); /* Queue ignore event for the watch */ inotify_handle_event(group, NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0, &iter_info); i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ inotify_remove_from_idr(group, i_mark); dec_inotify_watches(group->inotify_data.ucounts); } static int inotify_update_existing_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) { struct fsnotify_mark *fsn_mark; struct inotify_inode_mark *i_mark; __u32 old_mask, new_mask; __u32 mask; int add = (arg & IN_MASK_ADD); int create = (arg & IN_MASK_CREATE); int ret; mask = inotify_arg_to_mask(arg); fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) return -ENOENT; else if (create) { ret = -EEXIST; goto out; } i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); spin_lock(&fsn_mark->lock); old_mask = fsn_mark->mask; if (add) fsn_mark->mask |= mask; else fsn_mark->mask = mask; new_mask = fsn_mark->mask; spin_unlock(&fsn_mark->lock); if (old_mask != new_mask) { /* more bits in old than in new? */ int dropped = (old_mask & ~new_mask); /* more bits in this fsn_mark than the inode's mask? 
*/ int do_inode = (new_mask & ~inode->i_fsnotify_mask); /* update the inode with this new fsn_mark */ if (dropped || do_inode) fsnotify_recalc_mask(inode->i_fsnotify_marks); } /* return the wd */ ret = i_mark->wd; out: /* match the get from fsnotify_find_mark() */ fsnotify_put_mark(fsn_mark); return ret; } static int inotify_new_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) { struct inotify_inode_mark *tmp_i_mark; __u32 mask; int ret; struct idr *idr = &group->inotify_data.idr; spinlock_t *idr_lock = &group->inotify_data.idr_lock; mask = inotify_arg_to_mask(arg); tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); if (unlikely(!tmp_i_mark)) return -ENOMEM; fsnotify_init_mark(&tmp_i_mark->fsn_mark, group); tmp_i_mark->fsn_mark.mask = mask; tmp_i_mark->wd = -1; ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark); if (ret) goto out_err; /* increment the number of watches the user has */ if (!inc_inotify_watches(group->inotify_data.ucounts)) { inotify_remove_from_idr(group, tmp_i_mark); ret = -ENOSPC; goto out_err; } /* we are on the idr, now get on the inode */ ret = fsnotify_add_inode_mark_locked(&tmp_i_mark->fsn_mark, inode, 0); if (ret) { /* we failed to get on the inode, get off the idr */ inotify_remove_from_idr(group, tmp_i_mark); goto out_err; } /* return the watch descriptor for this new mark */ ret = tmp_i_mark->wd; out_err: /* match the ref from fsnotify_init_mark() */ fsnotify_put_mark(&tmp_i_mark->fsn_mark); return ret; } static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) { int ret = 0; mutex_lock(&group->mark_mutex); /* try to update and existing watch with the new arg */ ret = inotify_update_existing_watch(group, inode, arg); /* no mark present, try to add a new one */ if (ret == -ENOENT) ret = inotify_new_watch(group, inode, arg); mutex_unlock(&group->mark_mutex); return ret; } static struct fsnotify_group *inotify_new_group(unsigned int max_events) { struct fsnotify_group *group; struct inotify_event_info *oevent; group = fsnotify_alloc_group(&inotify_fsnotify_ops); if (IS_ERR(group)) return group; oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL); if (unlikely(!oevent)) { fsnotify_destroy_group(group); return ERR_PTR(-ENOMEM); } group->overflow_event = &oevent->fse; fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW); oevent->wd = -1; oevent->sync_cookie = 0; oevent->name_len = 0; group->max_events = max_events; group->memcg = get_mem_cgroup_from_mm(current->mm); spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); group->inotify_data.ucounts = inc_ucount(current_user_ns(), current_euid(), UCOUNT_INOTIFY_INSTANCES); if (!group->inotify_data.ucounts) { fsnotify_destroy_group(group); return ERR_PTR(-EMFILE); } return group; } /* inotify syscalls */ static int do_inotify_init(int flags) { struct fsnotify_group *group; int ret; /* Check the IN_* constants for consistency. 
*/ BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK); if (flags & ~(IN_CLOEXEC | IN_NONBLOCK)) return -EINVAL; /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ group = inotify_new_group(inotify_max_queued_events); if (IS_ERR(group)) return PTR_ERR(group); ret = anon_inode_getfd("inotify", &inotify_fops, group, O_RDONLY | flags); if (ret < 0) fsnotify_destroy_group(group); return ret; } SYSCALL_DEFINE1(inotify_init1, int, flags) { return do_inotify_init(flags); } SYSCALL_DEFINE0(inotify_init) { return do_inotify_init(0); } SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, u32, mask) { struct fsnotify_group *group; struct inode *inode; struct path path; struct fd f; int ret; unsigned flags = 0; /* * We share a lot of code with fs/dnotify. We also share * the bit layout between inotify's IN_* and the fsnotify * FS_*. This check ensures that only the inotify IN_* * bits get passed in and set in watches/events. */ if (unlikely(mask & ~ALL_INOTIFY_BITS)) return -EINVAL; /* * Require at least one valid bit set in the mask. * Without _something_ set, we would have no events to * watch for. */ if (unlikely(!(mask & ALL_INOTIFY_BITS))) return -EINVAL; f = fdget(fd); if (unlikely(!f.file)) return -EBADF; /* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */ if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) { ret = -EINVAL; goto fput_and_out; } /* verify that this is indeed an inotify instance */ if (unlikely(f.file->f_op != &inotify_fops)) { ret = -EINVAL; goto fput_and_out; } if (!(mask & IN_DONT_FOLLOW)) flags |= LOOKUP_FOLLOW; if (mask & IN_ONLYDIR) flags |= LOOKUP_DIRECTORY; ret = inotify_find_inode(pathname, &path, flags); if (ret) goto fput_and_out; /* inode held in place by reference to path; group by fget on fd */ inode = path.dentry->d_inode; group = f.file->private_data; /* create/update an inode mark */ ret = inotify_update_watch(group, inode, mask); path_put(&path); fput_and_out: fdput(f); return ret; } SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) { struct fsnotify_group *group; struct inotify_inode_mark *i_mark; struct fd f; int ret = 0; f = fdget(fd); if (unlikely(!f.file)) return -EBADF; /* verify that this is indeed an inotify instance */ ret = -EINVAL; if (unlikely(f.file->f_op != &inotify_fops)) goto out; group = f.file->private_data; ret = -EINVAL; i_mark = inotify_idr_find(group, wd); if (unlikely(!i_mark)) goto out; ret = 0; fsnotify_destroy_mark(&i_mark->fsn_mark, group); /* match ref taken by inotify_idr_find */ fsnotify_put_mark(&i_mark->fsn_mark); out: fdput(f); return ret; } /* * inotify_user_setup - Our initialization function. Note that we cannot return * error because we have compiled-in VFS hooks. So an (unlikely) failure here * must result in panic(). 
*/ static int __init inotify_user_setup(void) { BUILD_BUG_ON(IN_ACCESS != FS_ACCESS); BUILD_BUG_ON(IN_MODIFY != FS_MODIFY); BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB); BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE); BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); BUILD_BUG_ON(IN_OPEN != FS_OPEN); BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM); BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO); BUILD_BUG_ON(IN_CREATE != FS_CREATE); BUILD_BUG_ON(IN_DELETE != FS_DELETE); BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF); BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF); BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT); BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW); BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED); BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK); BUILD_BUG_ON(IN_ISDIR != FS_ISDIR); BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); BUG_ON(hweight32(ALL_INOTIFY_BITS) != 22); inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC|SLAB_ACCOUNT); inotify_max_queued_events = 16384; init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = 8192; return 0; } fs_initcall(inotify_user_setup);
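/*
 * Editor's note: a small userspace sketch (not part of inotify_user.c)
 * showing how the event stream produced by inotify_read() and
 * copy_event_to_user() above is consumed: each record is a struct
 * inotify_event followed by 'len' bytes of NUL-padded name, so the buffer
 * must be walked with a variable stride. The watched path "/tmp" and the
 * event mask are arbitrary example choices.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	/* align the buffer so casting to struct inotify_event is safe */
	char buf[4096] __attribute__((aligned(__alignof__(struct inotify_event))));
	ssize_t len;
	int fd, wd;

	fd = inotify_init1(IN_CLOEXEC);		/* ends up in do_inotify_init() */
	if (fd < 0) {
		perror("inotify_init1");
		return 1;
	}

	wd = inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE);
	if (wd < 0) {
		perror("inotify_add_watch");
		return 1;
	}

	/* blocks until at least one event has been queued for this group */
	len = read(fd, buf, sizeof(buf));
	for (char *p = buf; len > 0 && p < buf + len; ) {
		const struct inotify_event *ev = (const struct inotify_event *)p;

		printf("wd=%d mask=0x%x name=%s\n",
		       ev->wd, ev->mask, ev->len ? ev->name : "");
		p += sizeof(*ev) + ev->len;	/* header + padded name */
	}

	close(fd);
	return 0;
}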
/* * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ #include <linux/time.h> #include <linux/fs.h> #include "reiserfs.h" #include <linux/string.h> #include <linux/buffer_head.h> #include <stdarg.h> static char error_buf[1024]; static char fmt_buf[1024]; static char off_buf[80]; static char *reiserfs_cpu_offset(struct cpu_key *key) { if (cpu_key_k_type(key) == TYPE_DIRENTRY) sprintf(off_buf,
"%llu(%llu)", (unsigned long long) GET_HASH_VALUE(cpu_key_k_offset(key)), (unsigned long long) GET_GENERATION_NUMBER(cpu_key_k_offset(key))); else sprintf(off_buf, "0x%Lx", (unsigned long long)cpu_key_k_offset(key)); return off_buf; } static char *le_offset(struct reiserfs_key *key) { int version; version = le_key_version(key); if (le_key_k_type(version, key) == TYPE_DIRENTRY) sprintf(off_buf, "%llu(%llu)", (unsigned long long) GET_HASH_VALUE(le_key_k_offset(version, key)), (unsigned long long) GET_GENERATION_NUMBER(le_key_k_offset(version, key))); else sprintf(off_buf, "0x%Lx", (unsigned long long)le_key_k_offset(version, key)); return off_buf; } static char *cpu_type(struct cpu_key *key) { if (cpu_key_k_type(key) == TYPE_STAT_DATA) return "SD"; if (cpu_key_k_type(key) == TYPE_DIRENTRY) return "DIR"; if (cpu_key_k_type(key) == TYPE_DIRECT) return "DIRECT"; if (cpu_key_k_type(key) == TYPE_INDIRECT) return "IND"; return "UNKNOWN"; } static char *le_type(struct reiserfs_key *key) { int version; version = le_key_version(key); if (le_key_k_type(version, key) == TYPE_STAT_DATA) return "SD"; if (le_key_k_type(version, key) == TYPE_DIRENTRY) return "DIR"; if (le_key_k_type(version, key) == TYPE_DIRECT) return "DIRECT"; if (le_key_k_type(version, key) == TYPE_INDIRECT) return "IND"; return "UNKNOWN"; } /* %k */ static int scnprintf_le_key(char *buf, size_t size, struct reiserfs_key *key) { if (key) return scnprintf(buf, size, "[%d %d %s %s]", le32_to_cpu(key->k_dir_id), le32_to_cpu(key->k_objectid), le_offset(key), le_type(key)); else return scnprintf(buf, size, "[NULL]"); } /* %K */ static int scnprintf_cpu_key(char *buf, size_t size, struct cpu_key *key) { if (key) return scnprintf(buf, size, "[%d %d %s %s]", key->on_disk_key.k_dir_id, key->on_disk_key.k_objectid, reiserfs_cpu_offset(key), cpu_type(key)); else return scnprintf(buf, size, "[NULL]"); } static int scnprintf_de_head(char *buf, size_t size, struct reiserfs_de_head *deh) { if (deh) return scnprintf(buf, size, "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]", deh_offset(deh), deh_dir_id(deh), deh_objectid(deh), deh_location(deh), deh_state(deh)); else return scnprintf(buf, size, "[NULL]"); } static int scnprintf_item_head(char *buf, size_t size, struct item_head *ih) { if (ih) { char *p = buf; char * const end = buf + size; p += scnprintf(p, end - p, "%s", (ih_version(ih) == KEY_FORMAT_3_6) ? "*3.6* " : "*3.5*"); p += scnprintf_le_key(p, end - p, &ih->ih_key); p += scnprintf(p, end - p, ", item_len %d, item_location %d, free_space(entry_count) %d", ih_item_len(ih), ih_location(ih), ih_free_space(ih)); return p - buf; } else return scnprintf(buf, size, "[NULL]"); } static int scnprintf_direntry(char *buf, size_t size, struct reiserfs_dir_entry *de) { char name[20]; memcpy(name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen); name[de->de_namelen > 19 ? 19 : de->de_namelen] = 0; return scnprintf(buf, size, "\"%s\"==>[%d %d]", name, de->de_dir_id, de->de_objectid); } static int scnprintf_block_head(char *buf, size_t size, struct buffer_head *bh) { return scnprintf(buf, size, "level=%d, nr_items=%d, free_space=%d rdkey ", B_LEVEL(bh), B_NR_ITEMS(bh), B_FREE_SPACE(bh)); } static int scnprintf_buffer_head(char *buf, size_t size, struct buffer_head *bh) { return scnprintf(buf, size, "dev %pg, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)", bh->b_bdev, bh->b_size, (unsigned long long)bh->b_blocknr, atomic_read(&(bh->b_count)), bh->b_state, bh->b_page, buffer_uptodate(bh) ? 
"UPTODATE" : "!UPTODATE", buffer_dirty(bh) ? "DIRTY" : "CLEAN", buffer_locked(bh) ? "LOCKED" : "UNLOCKED"); } static int scnprintf_disk_child(char *buf, size_t size, struct disk_child *dc) { return scnprintf(buf, size, "[dc_number=%d, dc_size=%u]", dc_block_number(dc), dc_size(dc)); } static char *is_there_reiserfs_struct(char *fmt, int *what) { char *k = fmt; while ((k = strchr(k, '%')) != NULL) { if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' || k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a') { *what = k[1]; break; } k++; } return k; } /* * debugging reiserfs we used to print out a lot of different * variables, like keys, item headers, buffer heads etc. Values of * most fields matter. So it took a long time just to write * appropriative printk. With this reiserfs_warning you can use format * specification for complex structures like you used to do with * printfs for integers, doubles and pointers. For instance, to print * out key structure you have to write just: * reiserfs_warning ("bad key %k", key); * instead of * printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid, * key->k_offset, key->k_uniqueness); */ static DEFINE_SPINLOCK(error_lock); static void prepare_error_buf(const char *fmt, va_list args) { char *fmt1 = fmt_buf; char *k; char *p = error_buf; char * const end = &error_buf[sizeof(error_buf)]; int what; spin_lock(&error_lock); if (WARN_ON(strscpy(fmt_buf, fmt, sizeof(fmt_buf)) < 0)) { strscpy(error_buf, "format string too long", end - error_buf); goto out_unlock; } while ((k = is_there_reiserfs_struct(fmt1, &what)) != NULL) { *k = 0; p += vscnprintf(p, end - p, fmt1, args); switch (what) { case 'k': p += scnprintf_le_key(p, end - p, va_arg(args, struct reiserfs_key *)); break; case 'K': p += scnprintf_cpu_key(p, end - p, va_arg(args, struct cpu_key *)); break; case 'h': p += scnprintf_item_head(p, end - p, va_arg(args, struct item_head *)); break; case 't': p += scnprintf_direntry(p, end - p, va_arg(args, struct reiserfs_dir_entry *)); break; case 'y': p += scnprintf_disk_child(p, end - p, va_arg(args, struct disk_child *)); break; case 'z': p += scnprintf_block_head(p, end - p, va_arg(args, struct buffer_head *)); break; case 'b': p += scnprintf_buffer_head(p, end - p, va_arg(args, struct buffer_head *)); break; case 'a': p += scnprintf_de_head(p, end - p, va_arg(args, struct reiserfs_de_head *)); break; } fmt1 = k + 2; } p += vscnprintf(p, end - p, fmt1, args); out_unlock: spin_unlock(&error_lock); } /* * in addition to usual conversion specifiers this accepts reiserfs * specific conversion specifiers: * %k to print little endian key, * %K to print cpu key, * %h to print item_head, * %t to print directory entry * %z to print block head (arg must be struct buffer_head * * %b to print buffer_head */ #define do_reiserfs_warning(fmt)\ {\ va_list args;\ va_start( args, fmt );\ prepare_error_buf( fmt, args );\ va_end( args );\ } void __reiserfs_warning(struct super_block *sb, const char *id, const char *function, const char *fmt, ...) { do_reiserfs_warning(fmt); if (sb) printk(KERN_WARNING "REISERFS warning (device %s): %s%s%s: " "%s\n", sb->s_id, id ? id : "", id ? " " : "", function, error_buf); else printk(KERN_WARNING "REISERFS warning: %s%s%s: %s\n", id ? id : "", id ? " " : "", function, error_buf); } /* No newline.. reiserfs_info calls can be followed by printk's */ void reiserfs_info(struct super_block *sb, const char *fmt, ...) 
{ do_reiserfs_warning(fmt); if (sb) printk(KERN_NOTICE "REISERFS (device %s): %s", sb->s_id, error_buf); else printk(KERN_NOTICE "REISERFS %s:", error_buf); } /* No newline.. reiserfs_printk calls can be followed by printk's */ static void reiserfs_printk(const char *fmt, ...) { do_reiserfs_warning(fmt); printk(error_buf); } void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...) { #ifdef CONFIG_REISERFS_CHECK do_reiserfs_warning(fmt); if (s) printk(KERN_DEBUG "REISERFS debug (device %s): %s\n", s->s_id, error_buf); else printk(KERN_DEBUG "REISERFS debug: %s\n", error_buf); #endif } /* * The format: * * maintainer-errorid: [function-name:] message * * where errorid is unique to the maintainer and function-name is * optional, is recommended, so that anyone can easily find the bug * with a simple grep for the short to type string * maintainer-errorid. Don't bother with reusing errorids, there are * lots of numbers out there. * * Example: * * reiserfs_panic( * p_sb, "reiser-29: reiserfs_new_blocknrs: " * "one of search_start or rn(%d) is equal to MAX_B_NUM," * "which means that we are optimizing location based on the " * "bogus location of a temp buffer (%p).", * rn, bh * ); * * Regular panic()s sometimes clear the screen before the message can * be read, thus the need for the while loop. * * Numbering scheme for panic used by Vladimir and Anatoly( Hans completely * ignores this scheme, and considers it pointless complexity): * * panics in reiserfs_fs.h have numbers from 1000 to 1999 * super.c 2000 to 2999 * preserve.c (unused) 3000 to 3999 * bitmap.c 4000 to 4999 * stree.c 5000 to 5999 * prints.c 6000 to 6999 * namei.c 7000 to 7999 * fix_nodes.c 8000 to 8999 * dir.c 9000 to 9999 * lbalance.c 10000 to 10999 * ibalance.c 11000 to 11999 not ready * do_balan.c 12000 to 12999 * inode.c 13000 to 13999 * file.c 14000 to 14999 * objectid.c 15000 - 15999 * buffer.c 16000 - 16999 * symlink.c 17000 - 17999 * * . */ void __reiserfs_panic(struct super_block *sb, const char *id, const char *function, const char *fmt, ...) { do_reiserfs_warning(fmt); #ifdef CONFIG_REISERFS_CHECK dump_stack(); #endif if (sb) printk(KERN_WARNING "REISERFS panic (device %s): %s%s%s: %s\n", sb->s_id, id ? id : "", id ? " " : "", function, error_buf); else printk(KERN_WARNING "REISERFS panic: %s%s%s: %s\n", id ? id : "", id ? " " : "", function, error_buf); BUG(); } void __reiserfs_error(struct super_block *sb, const char *id, const char *function, const char *fmt, ...) { do_reiserfs_warning(fmt); BUG_ON(sb == NULL); if (reiserfs_error_panic(sb)) __reiserfs_panic(sb, id, function, error_buf); if (id && id[0]) printk(KERN_CRIT "REISERFS error (device %s): %s %s: %s\n", sb->s_id, id, function, error_buf); else printk(KERN_CRIT "REISERFS error (device %s): %s: %s\n", sb->s_id, function, error_buf); if (sb_rdonly(sb)) return; reiserfs_info(sb, "Remounting filesystem read-only\n"); sb->s_flags |= SB_RDONLY; reiserfs_abort_journal(sb, -EIO); } void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...) { do_reiserfs_warning(fmt); if (reiserfs_error_panic(sb)) { panic(KERN_CRIT "REISERFS panic (device %s): %s\n", sb->s_id, error_buf); } if (reiserfs_is_journal_aborted(SB_JOURNAL(sb))) return; printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id, error_buf); sb->s_flags |= SB_RDONLY; reiserfs_abort_journal(sb, errno); } /* * this prints internal nodes (4 keys/items in line) (dc_number, * dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number, * dc_size)... 
*/ static int print_internal(struct buffer_head *bh, int first, int last) { struct reiserfs_key *key; struct disk_child *dc; int i; int from, to; if (!B_IS_KEYS_LEVEL(bh)) return 1; check_internal(bh); if (first == -1) { from = 0; to = B_NR_ITEMS(bh); } else { from = first; to = last < B_NR_ITEMS(bh) ? last : B_NR_ITEMS(bh); } reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh); dc = B_N_CHILD(bh, from); reiserfs_printk("PTR %d: %y ", from, dc); for (i = from, key = internal_key(bh, from), dc++; i < to; i++, key++, dc++) { reiserfs_printk("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc); if (i && i % 4 == 0) printk("\n"); } printk("\n"); return 0; } static int print_leaf(struct buffer_head *bh, int print_mode, int first, int last) { struct block_head *blkh; struct item_head *ih; int i, nr; int from, to; if (!B_IS_ITEMS_LEVEL(bh)) return 1; check_leaf(bh); blkh = B_BLK_HEAD(bh); ih = item_head(bh, 0); nr = blkh_nr_item(blkh); printk ("\n===================================================================\n"); reiserfs_printk("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh); if (!(print_mode & PRINT_LEAF_ITEMS)) { reiserfs_printk("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n", &(ih->ih_key), &((ih + nr - 1)->ih_key)); return 0; } if (first < 0 || first > nr - 1) from = 0; else from = first; if (last < 0 || last > nr) to = nr; else to = last; ih += from; printk ("-------------------------------------------------------------------------------\n"); printk ("|##| type | key | ilen | free_space | version | loc |\n"); for (i = from; i < to; i++, ih++) { printk ("-------------------------------------------------------------------------------\n"); reiserfs_printk("|%2d| %h |\n", i, ih); if (print_mode & PRINT_LEAF_ITEMS) op_print_item(ih, ih_item_body(bh, ih)); } printk ("===================================================================\n"); return 0; } char *reiserfs_hashname(int code) { if (code == YURA_HASH) return "rupasov"; if (code == TEA_HASH) return "tea"; if (code == R5_HASH) return "r5"; return "unknown"; } /* return 1 if this is not super block */ static int print_super_block(struct buffer_head *bh) { struct reiserfs_super_block *rs = (struct reiserfs_super_block *)(bh->b_data); int skipped, data_blocks; char *version; if (is_reiserfs_3_5(rs)) { version = "3.5"; } else if (is_reiserfs_3_6(rs)) { version = "3.6"; } else if (is_reiserfs_jr(rs)) { version = ((sb_version(rs) == REISERFS_VERSION_2) ? "3.6" : "3.5"); } else { return 1; } printk("%pg\'s super block is in block %llu\n", bh->b_bdev, (unsigned long long)bh->b_blocknr); printk("Reiserfs version %s\n", version); printk("Block count %u\n", sb_block_count(rs)); printk("Blocksize %d\n", sb_blocksize(rs)); printk("Free blocks %u\n", sb_free_blocks(rs)); /* * FIXME: this would be confusing if * someone stores reiserfs super block in some data block ;) // skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs); */ skipped = bh->b_blocknr; data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) - (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) + 1 : sb_reserved_for_journal(rs)) - sb_free_blocks(rs); printk ("Busy blocks (skipped %d, bitmaps - %d, journal (or reserved) blocks - %d\n" "1 super block, %d data blocks\n", skipped, sb_bmap_nr(rs), (!is_reiserfs_jr(rs) ? 
(sb_jp_journal_size(rs) + 1) : sb_reserved_for_journal(rs)), data_blocks); printk("Root block %u\n", sb_root_block(rs)); printk("Journal block (first) %d\n", sb_jp_journal_1st_block(rs)); printk("Journal dev %d\n", sb_jp_journal_dev(rs)); printk("Journal orig size %d\n", sb_jp_journal_size(rs)); printk("FS state %d\n", sb_fs_state(rs)); printk("Hash function \"%s\"\n", reiserfs_hashname(sb_hash_function_code(rs))); printk("Tree height %d\n", sb_tree_height(rs)); return 0; } static int print_desc_block(struct buffer_head *bh) { struct reiserfs_journal_desc *desc; if (memcmp(get_journal_desc_magic(bh), JOURNAL_DESC_MAGIC, 8)) return 1; desc = (struct reiserfs_journal_desc *)(bh->b_data); printk("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)", (unsigned long long)bh->b_blocknr, get_desc_trans_id(desc), get_desc_mount_id(desc), get_desc_trans_len(desc)); return 0; } /* ..., int print_mode, int first, int last) */ void print_block(struct buffer_head *bh, ...) { va_list args; int mode, first, last; if (!bh) { printk("print_block: buffer is NULL\n"); return; } va_start(args, bh); mode = va_arg(args, int); first = va_arg(args, int); last = va_arg(args, int); if (print_leaf(bh, mode, first, last)) if (print_internal(bh, first, last)) if (print_super_block(bh)) if (print_desc_block(bh)) printk ("Block %llu contains unformatted data\n", (unsigned long long)bh->b_blocknr); va_end(args); } static char print_tb_buf[2048]; /* this stores initial state of tree balance in the print_tb_buf */ void store_print_tb(struct tree_balance *tb) { int h = 0; int i; struct buffer_head *tbSh, *tbFh; if (!tb) return; sprintf(print_tb_buf, "\n" "BALANCING %d\n" "MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n" "=====================================================================\n" "* h * S * L * R * F * FL * FR * CFL * CFR *\n", REISERFS_SB(tb->tb_sb)->s_do_balance, tb->tb_mode, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item); for (h = 0; h < ARRAY_SIZE(tb->insert_size); h++) { if (PATH_H_PATH_OFFSET(tb->tb_path, h) <= tb->tb_path->path_length && PATH_H_PATH_OFFSET(tb->tb_path, h) > ILLEGAL_PATH_ELEMENT_OFFSET) { tbSh = PATH_H_PBUFFER(tb->tb_path, h); tbFh = PATH_H_PPARENT(tb->tb_path, h); } else { tbSh = NULL; tbFh = NULL; } sprintf(print_tb_buf + strlen(print_tb_buf), "* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n", h, (tbSh) ? (long long)(tbSh->b_blocknr) : (-1LL), (tbSh) ? atomic_read(&tbSh->b_count) : -1, (tb->L[h]) ? (long long)(tb->L[h]->b_blocknr) : (-1LL), (tb->L[h]) ? atomic_read(&tb->L[h]->b_count) : -1, (tb->R[h]) ? (long long)(tb->R[h]->b_blocknr) : (-1LL), (tb->R[h]) ? atomic_read(&tb->R[h]->b_count) : -1, (tbFh) ? (long long)(tbFh->b_blocknr) : (-1LL), (tb->FL[h]) ? (long long)(tb->FL[h]-> b_blocknr) : (-1LL), (tb->FR[h]) ? (long long)(tb->FR[h]-> b_blocknr) : (-1LL), (tb->CFL[h]) ? (long long)(tb->CFL[h]-> b_blocknr) : (-1LL), (tb->CFR[h]) ? 
(long long)(tb->CFR[h]-> b_blocknr) : (-1LL)); } sprintf(print_tb_buf + strlen(print_tb_buf), "=====================================================================\n" "* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n" "* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n", tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0], tb->rbytes, tb->blknum[0], tb->s0num, tb->snum[0], tb->sbytes[0], tb->snum[1], tb->sbytes[1], tb->cur_blknum, tb->lkey[0], tb->rkey[0]); /* this prints balance parameters for non-leaf levels */ h = 0; do { h++; sprintf(print_tb_buf + strlen(print_tb_buf), "* %d * %4d * %2d * * %2d * * %2d *\n", h, tb->insert_size[h], tb->lnum[h], tb->rnum[h], tb->blknum[h]); } while (tb->insert_size[h]); sprintf(print_tb_buf + strlen(print_tb_buf), "=====================================================================\n" "FEB list: "); /* print FEB list (list of buffers in form (bh (b_blocknr, b_count), that will be used for new nodes) */ h = 0; for (i = 0; i < ARRAY_SIZE(tb->FEB); i++) sprintf(print_tb_buf + strlen(print_tb_buf), "%p (%llu %d)%s", tb->FEB[i], tb->FEB[i] ? (unsigned long long)tb->FEB[i]-> b_blocknr : 0ULL, tb->FEB[i] ? atomic_read(&tb->FEB[i]->b_count) : 0, (i == ARRAY_SIZE(tb->FEB) - 1) ? "\n" : ", "); sprintf(print_tb_buf + strlen(print_tb_buf), "======================== the end ====================================\n"); } void print_cur_tb(char *mes) { printk("%s\n%s", mes, print_tb_buf); } static void check_leaf_block_head(struct buffer_head *bh) { struct block_head *blkh; int nr; blkh = B_BLK_HEAD(bh); nr = blkh_nr_item(blkh); if (nr > (bh->b_size - BLKH_SIZE) / IH_SIZE) reiserfs_panic(NULL, "vs-6010", "invalid item number %z", bh); if (blkh_free_space(blkh) > bh->b_size - BLKH_SIZE - IH_SIZE * nr) reiserfs_panic(NULL, "vs-6020", "invalid free space %z", bh); } static void check_internal_block_head(struct buffer_head *bh) { struct block_head *blkh; blkh = B_BLK_HEAD(bh); if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT)) reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh); if (B_NR_ITEMS(bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE) reiserfs_panic(NULL, "vs-6030", "invalid item number %z", bh); if (B_FREE_SPACE(bh) != bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS(bh) - DC_SIZE * (B_NR_ITEMS(bh) + 1)) reiserfs_panic(NULL, "vs-6040", "invalid free space %z", bh); } void check_leaf(struct buffer_head *bh) { int i; struct item_head *ih; if (!bh) return; check_leaf_block_head(bh); for (i = 0, ih = item_head(bh, 0); i < B_NR_ITEMS(bh); i++, ih++) op_check_item(ih, ih_item_body(bh, ih)); } void check_internal(struct buffer_head *bh) { if (!bh) return; check_internal_block_head(bh); } void print_statistics(struct super_block *s) { /* printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \ bmap with search %d, without %d, dir2ind %d, ind2dir %d\n", REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes, REISERFS_SB(s)->s_bmaps, REISERFS_SB(s)->s_bmaps_without_search, REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct); */ } |
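/*
 * Illustrative user-space sketch (not part of the kernel file above): the
 * same technique prepare_error_buf() uses -- cut the format string at a
 * custom specifier, print the prefix with the normal printf engine, splice
 * in a structured dump, then continue after the specifier.  struct demo_key
 * and the "%k" handler here are invented for the example; like the kernel
 * code, it relies on va_list acting as a reference (true on x86-64 Linux),
 * so arguments consumed by vsnprintf() stay consumed for the caller.
 */
#include <stdarg.h>
#include <stdio.h>
#include <string.h>

struct demo_key {
	unsigned int dir_id;
	unsigned int objectid;
};

/* find the next "%k", skipping other conversions (cf. is_there_reiserfs_struct) */
static char *find_key_spec(char *fmt)
{
	char *k = fmt;

	while ((k = strchr(k, '%')) != NULL) {
		if (k[1] == 'k')
			return k;
		k++;
	}
	return NULL;
}

static void demo_warning(const char *fmt, ...)
{
	char out[256], fmtbuf[128];
	char *p = out, * const end = out + sizeof(out);
	char *cur = fmtbuf, *k;
	va_list args;

	snprintf(fmtbuf, sizeof(fmtbuf), "%s", fmt);	/* writable copy, cf. fmt_buf */

	va_start(args, fmt);
	while ((k = find_key_spec(cur)) != NULL) {
		struct demo_key *key;

		*k = '\0';				/* terminate the prefix at "%k" */
		p += vsnprintf(p, end - p, cur, args);
		if (p >= end)
			p = end - 1;
		key = va_arg(args, struct demo_key *);	/* handle "%k" ourselves */
		p += snprintf(p, end - p, "[%u %u]", key->dir_id, key->objectid);
		if (p >= end)
			p = end - 1;
		cur = k + 2;				/* resume after "%k" */
	}
	vsnprintf(p, end - p, cur, args);
	va_end(args);

	fprintf(stderr, "demo warning: %s\n", out);
}

int main(void)
{
	struct demo_key key = { .dir_id = 2, .objectid = 5 };

	/* prints: demo warning: bad key [2 5] in block 42 */
	demo_warning("bad key %k in block %d", &key, 42);
	return 0;
}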
1027 816 324 816 816 21 3 2 2 470 29 65 22 7 29 29 37 37 37 570 457 366 366 152 150 152 154 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 | /* tnum: tracked (or tristate) numbers * * A tnum tracks knowledge about the bits of a value. Each bit can be either * known (0 or 1), or unknown (x). Arithmetic operations on tnums will * propagate the unknown bits such that the tnum result represents all the * possible results for possible values of the operands. */ #include <linux/kernel.h> #include <linux/tnum.h> #define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m} /* A completely unknown value */ const struct tnum tnum_unknown = { .value = 0, .mask = -1 }; struct tnum tnum_const(u64 value) { return TNUM(value, 0); } struct tnum tnum_range(u64 min, u64 max) { u64 chi = min ^ max, delta; u8 bits = fls64(chi); /* special case, needed because 1ULL << 64 is undefined */ if (bits > 63) return tnum_unknown; /* e.g. if chi = 4, bits = 3, delta = (1<<3) - 1 = 7. * if chi = 0, bits = 0, delta = (1<<0) - 1 = 0, so we return * constant min (since min == max). */ delta = (1ULL << bits) - 1; return TNUM(min & ~delta, delta); } struct tnum tnum_lshift(struct tnum a, u8 shift) { return TNUM(a.value << shift, a.mask << shift); } struct tnum tnum_rshift(struct tnum a, u8 shift) { return TNUM(a.value >> shift, a.mask >> shift); } struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness) { /* if a.value is negative, arithmetic shifting by minimum shift * will have larger negative offset compared to more shifting. * If a.value is nonnegative, arithmetic shifting by minimum shift * will have larger positive offset compare to more shifting. */ if (insn_bitness == 32) return TNUM((u32)(((s32)a.value) >> min_shift), (u32)(((s32)a.mask) >> min_shift)); else return TNUM((s64)a.value >> min_shift, (s64)a.mask >> min_shift); } struct tnum tnum_add(struct tnum a, struct tnum b) { u64 sm, sv, sigma, chi, mu; sm = a.mask + b.mask; sv = a.value + b.value; sigma = sm + sv; chi = sigma ^ sv; mu = chi | a.mask | b.mask; return TNUM(sv & ~mu, mu); } struct tnum tnum_sub(struct tnum a, struct tnum b) { u64 dv, alpha, beta, chi, mu; dv = a.value - b.value; alpha = dv + a.mask; beta = dv - b.mask; chi = alpha ^ beta; mu = chi | a.mask | b.mask; return TNUM(dv & ~mu, mu); } struct tnum tnum_and(struct tnum a, struct tnum b) { u64 alpha, beta, v; alpha = a.value | a.mask; beta = b.value | b.mask; v = a.value & b.value; return TNUM(v, alpha & beta & ~v); } struct tnum tnum_or(struct tnum a, struct tnum b) { u64 v, mu; v = a.value | b.value; mu = a.mask | b.mask; return TNUM(v, mu & ~v); } struct tnum tnum_xor(struct tnum a, struct tnum b) { u64 v, mu; v = a.value ^ b.value; mu = a.mask | b.mask; return TNUM(v & ~mu, mu); } /* half-multiply add: acc += (unknown * mask * value). * An intermediate step in the multiply algorithm. 
*/ static struct tnum hma(struct tnum acc, u64 value, u64 mask) { while (mask) { if (mask & 1) acc = tnum_add(acc, TNUM(0, value)); mask >>= 1; value <<= 1; } return acc; } struct tnum tnum_mul(struct tnum a, struct tnum b) { struct tnum acc; u64 pi; pi = a.value * b.value; acc = hma(TNUM(pi, 0), a.mask, b.mask | b.value); return hma(acc, b.mask, a.value); } /* Note that if a and b disagree - i.e. one has a 'known 1' where the other has * a 'known 0' - this will return a 'known 1' for that bit. */ struct tnum tnum_intersect(struct tnum a, struct tnum b) { u64 v, mu; v = a.value | b.value; mu = a.mask & b.mask; return TNUM(v & ~mu, mu); } struct tnum tnum_cast(struct tnum a, u8 size) { a.value &= (1ULL << (size * 8)) - 1; a.mask &= (1ULL << (size * 8)) - 1; return a; } bool tnum_is_aligned(struct tnum a, u64 size) { if (!size) return true; return !((a.value | a.mask) & (size - 1)); } bool tnum_in(struct tnum a, struct tnum b) { if (b.mask & ~a.mask) return false; b.value &= ~a.mask; return a.value == b.value; } int tnum_strn(char *str, size_t size, struct tnum a) { return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask); } EXPORT_SYMBOL_GPL(tnum_strn); int tnum_sbin(char *str, size_t size, struct tnum a) { size_t n; for (n = 64; n; n--) { if (n < size) { if (a.mask & 1) str[n - 1] = 'x'; else if (a.value & 1) str[n - 1] = '1'; else str[n - 1] = '0'; } a.mask >>= 1; a.value >>= 1; } str[min(size - 1, (size_t)64)] = 0; return 64; } |
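/*
 * Worked example (illustrative, user space; not part of tnum.c): how a
 * couple of the operations above behave on concrete values.  The struct
 * and the add helper are pared-down copies of the kernel definitions so
 * the snippet compiles on its own.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_tnum {
	uint64_t value;
	uint64_t mask;
};

#define DEMO_TNUM(v, m) ((struct demo_tnum){ .value = (v), .mask = (m) })

/* same algorithm as tnum_add() above */
static struct demo_tnum demo_add(struct demo_tnum a, struct demo_tnum b)
{
	uint64_t sm = a.mask + b.mask;
	uint64_t sv = a.value + b.value;
	uint64_t sigma = sm + sv;
	uint64_t chi = sigma ^ sv;
	uint64_t mu = chi | a.mask | b.mask;

	return DEMO_TNUM(sv & ~mu, mu);
}

int main(void)
{
	/* tnum_range(4, 7) yields value 4, mask 3: bit 2 known 1, bits 0-1 unknown */
	struct demo_tnum range = DEMO_TNUM(4, 3);
	/* the constant 8: all bits known */
	struct demo_tnum eight = DEMO_TNUM(8, 0);
	/* 8 + [4..7] == [12..15]: value 12, mask 3 */
	struct demo_tnum sum = demo_add(eight, range);

	/* prints "(0xc; 0x3)" in the same notation as tnum_strn() */
	printf("(%#llx; %#llx)\n",
	       (unsigned long long)sum.value, (unsigned long long)sum.mask);
	return 0;
}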
3 3 3 3 3 3 3 3 3 3 2 3 3 41 2 5 5 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright * details */ #include <linux/time.h> #include <linux/pagemap.h> #include <linux/buffer_head.h> #include "reiserfs.h" /* * access to tail : when one is going to read tail it must make sure, that is * not running. direct2indirect and indirect2direct can not run concurrently */ /* * Converts direct items to an unformatted node. Panics if file has no * tail. -ENOSPC if no disk space for conversion */ /* * path points to first direct item of the file regardless of how many of * them are there */ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *path, struct buffer_head *unbh, loff_t tail_offset) { struct super_block *sb = inode->i_sb; struct buffer_head *up_to_date_bh; struct item_head *p_le_ih = tp_item_head(path); unsigned long total_tail = 0; /* Key to search for the last byte of the converted item. */ struct cpu_key end_key; /* * new indirect item to be inserted or key * of unfm pointer to be pasted */ struct item_head ind_ih; int blk_size; /* returned value for reiserfs_insert_item and clones */ int retval; /* Handle on an unformatted node that will be inserted in the tree. */ unp_t unfm_ptr; BUG_ON(!th->t_trans_id); REISERFS_SB(sb)->s_direct2indirect++; blk_size = sb->s_blocksize; /* * and key to search for append or insert pointer to the new * unformatted node. */ copy_item_head(&ind_ih, p_le_ih); set_le_ih_k_offset(&ind_ih, tail_offset); set_le_ih_k_type(&ind_ih, TYPE_INDIRECT); /* Set the key to search for the place for new unfm pointer */ make_cpu_key(&end_key, inode, tail_offset, TYPE_INDIRECT, 4); /* FIXME: we could avoid this */ if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) { reiserfs_error(sb, "PAP-14030", "pasted or inserted byte exists in " "the tree %K. Use fsck to repair.", &end_key); pathrelse(path); return -EIO; } p_le_ih = tp_item_head(path); unfm_ptr = cpu_to_le32(unbh->b_blocknr); if (is_statdata_le_ih(p_le_ih)) { /* Insert new indirect item. 
*/ set_ih_free_space(&ind_ih, 0); /* delete at nearest future */ put_ih_item_len(&ind_ih, UNFM_P_SIZE); PATH_LAST_POSITION(path)++; retval = reiserfs_insert_item(th, path, &end_key, &ind_ih, inode, (char *)&unfm_ptr); } else { /* Paste into last indirect item of an object. */ retval = reiserfs_paste_into_item(th, path, &end_key, inode, (char *)&unfm_ptr, UNFM_P_SIZE); } if (retval) { return retval; } /* * note: from here there are two keys which have matching first * three key components. They only differ by the fourth one. */ /* Set the key to search for the direct items of the file */ make_cpu_key(&end_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT, 4); /* * Move bytes from the direct items to the new unformatted node * and delete them. */ while (1) { int tail_size; /* * end_key.k_offset is set so, that we will always have found * last item of the file */ if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) reiserfs_panic(sb, "PAP-14050", "direct item (%K) not found", &end_key); p_le_ih = tp_item_head(path); RFALSE(!is_direct_le_ih(p_le_ih), "vs-14055: direct item expected(%K), found %h", &end_key, p_le_ih); tail_size = (le_ih_k_offset(p_le_ih) & (blk_size - 1)) + ih_item_len(p_le_ih) - 1; /* * we only send the unbh pointer if the buffer is not * up to date. this avoids overwriting good data from * writepage() with old data from the disk or buffer cache * Special case: unbh->b_page will be NULL if we are coming * through DIRECT_IO handler here. */ if (!unbh->b_page || buffer_uptodate(unbh) || PageUptodate(unbh->b_page)) { up_to_date_bh = NULL; } else { up_to_date_bh = unbh; } retval = reiserfs_delete_item(th, path, &end_key, inode, up_to_date_bh); total_tail += retval; /* done: file does not have direct items anymore */ if (tail_size == retval) break; } /* * if we've copied bytes from disk into the page, we need to zero * out the unused part of the block (it was not up to date before) */ if (up_to_date_bh) { unsigned pgoff = (tail_offset + total_tail - 1) & (PAGE_SIZE - 1); char *kaddr = kmap_atomic(up_to_date_bh->b_page); memset(kaddr + pgoff, 0, blk_size - total_tail); kunmap_atomic(kaddr); } REISERFS_I(inode)->i_first_direct_byte = U32_MAX; return 0; } /* stolen from fs/buffer.c */ void reiserfs_unmap_buffer(struct buffer_head *bh) { lock_buffer(bh); if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { BUG(); } clear_buffer_dirty(bh); /* * Remove the buffer from whatever list it belongs to. We are mostly * interested in removing it from per-sb j_dirty_buffers list, to avoid * BUG() on attempt to write not mapped buffer */ if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) { struct inode *inode = bh->b_page->mapping->host; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); spin_lock(&j->j_dirty_buffers_lock); list_del_init(&bh->b_assoc_buffers); reiserfs_free_jh(bh); spin_unlock(&j->j_dirty_buffers_lock); } clear_buffer_mapped(bh); clear_buffer_req(bh); clear_buffer_new(bh); bh->b_bdev = NULL; unlock_buffer(bh); } /* * this first locks inode (neither reads nor sync are permitted), * reads tail through page cache, insert direct item. When direct item * inserted successfully inode is left locked. Return value is always * what we expect from it (number of cut bytes). But when tail remains * in the unformatted node, we set mode to SKIP_BALANCING and unlock * inode */ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *inode, struct page *page, struct treepath *path, /* path to the indirect item. 
*/ const struct cpu_key *item_key, /* Key to look for * unformatted node * pointer to be cut. */ loff_t n_new_file_size, /* New file size. */ char *mode) { struct super_block *sb = inode->i_sb; struct item_head s_ih; unsigned long block_size = sb->s_blocksize; char *tail; int tail_len, round_tail_len; loff_t pos, pos1; /* position of first byte of the tail */ struct cpu_key key; BUG_ON(!th->t_trans_id); REISERFS_SB(sb)->s_indirect2direct++; *mode = M_SKIP_BALANCING; /* store item head path points to. */ copy_item_head(&s_ih, tp_item_head(path)); tail_len = (n_new_file_size & (block_size - 1)); if (get_inode_sd_version(inode) == STAT_DATA_V2) round_tail_len = ROUND_UP(tail_len); else round_tail_len = tail_len; pos = le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - 1) * sb->s_blocksize; pos1 = pos; /* * we are protected by i_mutex. The tail can not disapper, not * append can be done either * we are in truncate or packing tail in file_release */ tail = (char *)kmap(page); /* this can schedule */ if (path_changed(&s_ih, path)) { /* re-search indirect item */ if (search_for_position_by_key(sb, item_key, path) == POSITION_NOT_FOUND) reiserfs_panic(sb, "PAP-5520", "item to be converted %K does not exist", item_key); copy_item_head(&s_ih, tp_item_head(path)); #ifdef CONFIG_REISERFS_CHECK pos = le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - 1) * sb->s_blocksize; if (pos != pos1) reiserfs_panic(sb, "vs-5530", "tail position " "changed while we were reading it"); #endif } /* Set direct item header to insert. */ make_le_item_head(&s_ih, NULL, get_inode_item_key_version(inode), pos1 + 1, TYPE_DIRECT, round_tail_len, 0xffff /*ih_free_space */ ); /* * we want a pointer to the first byte of the tail in the page. * the page was locked and this part of the page was up to date when * indirect2direct was called, so we know the bytes are still valid */ tail = tail + (pos & (PAGE_SIZE - 1)); PATH_LAST_POSITION(path)++; key = *item_key; set_cpu_key_k_type(&key, TYPE_DIRECT); key.key_length = 4; /* Insert tail as new direct item in the tree */ if (reiserfs_insert_item(th, path, &key, &s_ih, inode, tail ? tail : NULL) < 0) { /* * No disk memory. So we can not convert last unformatted node * to the direct item. In this case we used to adjust * indirect items's ih_free_space. Now ih_free_space is not * used, it would be ideal to write zeros to corresponding * unformatted node. For now i_size is considered as guard for * going out of file size */ kunmap(page); return block_size - round_tail_len; } kunmap(page); /* make sure to get the i_blocks changes from reiserfs_insert_item */ reiserfs_update_sd(th, inode); /* * note: we have now the same as in above direct2indirect * conversion: there are two keys which have matching first three * key components. They only differ by the fourth one. */ /* * We have inserted new direct item and must remove last * unformatted node. */ *mode = M_CUT; /* we store position of first direct item in the in-core inode */ /* mark_file_with_tail (inode, pos1 + 1); */ REISERFS_I(inode)->i_first_direct_byte = pos1 + 1; return block_size - round_tail_len; } |
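/*
 * Worked example (illustrative, user space; the file size and item layout
 * below are assumptions for a default 4 KiB-block reiserfs): the offset
 * arithmetic indirect2direct() uses to locate the tail of a 9000-byte file
 * whose body currently sits in three unformatted nodes, the last of them
 * only partially filled.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long block_size = 4096;
	const unsigned long unfm_p_size = 4;	/* one on-disk block pointer */

	unsigned long long new_file_size = 9000;
	unsigned long long ih_offset = 1;	/* indirect item covers bytes 1.. */
	unsigned long ih_item_len = 3 * unfm_p_size;	/* three block pointers */

	/* bytes that live in the last, partially filled block */
	unsigned long tail_len = new_file_size & (block_size - 1);	/* 808 */
	/* rounded up to 8 bytes for new-format (v2) stat data items */
	unsigned long round_tail_len = (tail_len + 7) & ~7UL;		/* 808 */
	/* zero-based offset of the first tail byte = start of the last block */
	unsigned long long pos = ih_offset - 1 +
		(ih_item_len / unfm_p_size - 1) * block_size;		/* 8192 */

	printf("tail: %lu bytes (padded to %lu), starts at byte %llu;\n"
	       "new direct item gets key offset %llu\n",
	       tail_len, round_tail_len, pos, pos + 1);
	return 0;
}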
40 39 37 36 36 20 14 17 16 15 19 18 14 8 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 | /* Copyright (c) 2017 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ #include <linux/slab.h> #include <linux/bpf.h> #include "map_in_map.h" struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) { struct bpf_map *inner_map, *inner_map_meta; u32 inner_map_meta_size; struct fd f; f = fdget(inner_map_ufd); inner_map = __bpf_map_get(f); if (IS_ERR(inner_map)) return inner_map; /* prog_array->owner_prog_type and owner_jited * is a runtime binding. Doing static check alone * in the verifier is not enough. */ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE) { fdput(f); return ERR_PTR(-ENOTSUPP); } /* Does not support >1 level map-in-map */ if (inner_map->inner_map_meta) { fdput(f); return ERR_PTR(-EINVAL); } inner_map_meta_size = sizeof(*inner_map_meta); /* In some cases verifier needs to access beyond just base map. */ if (inner_map->ops == &array_map_ops) inner_map_meta_size = sizeof(struct bpf_array); inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); if (!inner_map_meta) { fdput(f); return ERR_PTR(-ENOMEM); } inner_map_meta->map_type = inner_map->map_type; inner_map_meta->key_size = inner_map->key_size; inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; inner_map_meta->max_entries = inner_map->max_entries; /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; if (inner_map->ops == &array_map_ops) { inner_map_meta->unpriv_array = inner_map->unpriv_array; container_of(inner_map_meta, struct bpf_array, map)->index_mask = container_of(inner_map, struct bpf_array, map)->index_mask; } fdput(f); return inner_map_meta; } void bpf_map_meta_free(struct bpf_map *map_meta) { kfree(map_meta); } bool bpf_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1) { /* No need to compare ops because it is covered by map_type */ return meta0->map_type == meta1->map_type && meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && meta0->map_flags == meta1->map_flags && meta0->max_entries == meta1->max_entries; } void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, int ufd) { struct bpf_map *inner_map; struct fd f; f = fdget(ufd); inner_map = __bpf_map_get(f); if (IS_ERR(inner_map)) return inner_map; if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) inner_map = bpf_map_inc(inner_map, false); else inner_map = ERR_PTR(-EINVAL); fdput(f); return inner_map; } void bpf_map_fd_put_ptr(void *ptr) { /* ptr->ops->map_free() has to go through one * rcu grace period by itself. */ bpf_map_put(ptr); } u32 bpf_map_fd_sys_lookup_elem(void *ptr) { return ((struct bpf_map *)ptr)->id; } |
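/*
 * Usage sketch (user space, not part of map_in_map.c): creating an
 * array-of-maps with the raw bpf(2) syscall.  The inner map passed via
 * attr.inner_map_fd is only used to build the meta map shown above, so its
 * fd can be closed afterwards (or kept and inserted as a real inner map).
 * Error handling is trimmed to keep the sketch short.
 */
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sys_bpf_create(union bpf_attr *attr)
{
	return (int)syscall(__NR_bpf, BPF_MAP_CREATE, attr, sizeof(*attr));
}

int create_array_of_maps(void)
{
	union bpf_attr attr;
	int inner_fd, outer_fd;

	/* template inner map: a plain array of u32 -> u64 */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_ARRAY;
	attr.key_size = sizeof(__u32);
	attr.value_size = sizeof(__u64);
	attr.max_entries = 16;
	inner_fd = sys_bpf_create(&attr);
	if (inner_fd < 0)
		return inner_fd;

	/* outer map: every slot must later hold a map that passes the
	 * bpf_map_meta_equal() check against this template */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_ARRAY_OF_MAPS;
	attr.key_size = sizeof(__u32);
	attr.value_size = sizeof(__u32);	/* values are fds of inner maps */
	attr.max_entries = 8;
	attr.inner_map_fd = inner_fd;
	outer_fd = sys_bpf_create(&attr);

	close(inner_fd);	/* meta data was copied; template fd no longer needed */
	return outer_fd;
}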
1664 1669 1668 1671 207 207 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 | /* SPDX-License-Identifier: GPL-2.0 */ /* * connection tracking event cache. */ #ifndef _NF_CONNTRACK_ECACHE_H #define _NF_CONNTRACK_ECACHE_H #include <net/netfilter/nf_conntrack.h> #include <net/net_namespace.h> #include <net/netfilter/nf_conntrack_expect.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tuple_common.h> #include <net/netfilter/nf_conntrack_extend.h> enum nf_ct_ecache_state { NFCT_ECACHE_UNKNOWN, /* destroy event not sent */ NFCT_ECACHE_DESTROY_FAIL, /* tried but failed to send destroy event */ NFCT_ECACHE_DESTROY_SENT, /* sent destroy event after failure */ }; struct nf_conntrack_ecache { unsigned long cache; /* bitops want long */ u16 missed; /* missed events */ u16 ctmask; /* bitmask of ct events to be delivered */ u16 expmask; /* bitmask of expect events to be delivered */ enum nf_ct_ecache_state state:8;/* ecache state */ u32 portid; /* netlink portid of destroyer */ }; static inline struct nf_conntrack_ecache * nf_ct_ecache_find(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_EVENTS return nf_ct_ext_find(ct, NF_CT_EXT_ECACHE); #else return NULL; #endif } static inline struct nf_conntrack_ecache * nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) { #ifdef CONFIG_NF_CONNTRACK_EVENTS struct net *net = nf_ct_net(ct); struct nf_conntrack_ecache *e; if (!ctmask && !expmask && net->ct.sysctl_events) { ctmask = ~0; expmask = ~0; } if (!ctmask && !expmask) return NULL; e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp); if (e) { e->ctmask = ctmask; e->expmask = expmask; } return e; #else return NULL; #endif }; #ifdef CONFIG_NF_CONNTRACK_EVENTS /* This structure is passed to event handler */ struct nf_ct_event { struct nf_conn *ct; u32 portid; int report; }; struct nf_ct_event_notifier { int (*fcn)(unsigned int events, struct nf_ct_event *item); }; int nf_conntrack_register_notifier(struct net *net, struct nf_ct_event_notifier *nb); void nf_conntrack_unregister_notifier(struct net *net, struct nf_ct_event_notifier *nb); void nf_ct_deliver_cached_events(struct nf_conn *ct); int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, u32 portid, int report); static inline void nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct) { struct net *net = nf_ct_net(ct); struct nf_conntrack_ecache *e; if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb)) return; e = nf_ct_ecache_find(ct); if (e == NULL) return; set_bit(event, &e->cache); } static inline int nf_conntrack_event_report(enum ip_conntrack_events event, struct nf_conn *ct, u32 portid, int report) { const struct net *net = nf_ct_net(ct); if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb)) return 0; return nf_conntrack_eventmask_report(1 << event, ct, 
portid, report); } static inline int nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct) { const struct net *net = nf_ct_net(ct); if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb)) return 0; return nf_conntrack_eventmask_report(1 << event, ct, 0, 0); } struct nf_exp_event { struct nf_conntrack_expect *exp; u32 portid; int report; }; struct nf_exp_event_notifier { int (*fcn)(unsigned int events, struct nf_exp_event *item); }; int nf_ct_expect_register_notifier(struct net *net, struct nf_exp_event_notifier *nb); void nf_ct_expect_unregister_notifier(struct net *net, struct nf_exp_event_notifier *nb); void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, struct nf_conntrack_expect *exp, u32 portid, int report); int nf_conntrack_ecache_pernet_init(struct net *net); void nf_conntrack_ecache_pernet_fini(struct net *net); int nf_conntrack_ecache_init(void); void nf_conntrack_ecache_fini(void); static inline void nf_conntrack_ecache_delayed_work(struct net *net) { if (!delayed_work_pending(&net->ct.ecache_dwork)) { schedule_delayed_work(&net->ct.ecache_dwork, HZ); net->ct.ecache_dwork_pending = true; } } static inline void nf_conntrack_ecache_work(struct net *net) { if (net->ct.ecache_dwork_pending) { net->ct.ecache_dwork_pending = false; mod_delayed_work(system_wq, &net->ct.ecache_dwork, 0); } } #else /* CONFIG_NF_CONNTRACK_EVENTS */ static inline void nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct) {} static inline int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, u32 portid, int report) { return 0; } static inline int nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct) { return 0; } static inline int nf_conntrack_event_report(enum ip_conntrack_events event, struct nf_conn *ct, u32 portid, int report) { return 0; } static inline void nf_ct_deliver_cached_events(const struct nf_conn *ct) {} static inline void nf_ct_expect_event_report(enum ip_conntrack_expect_events e, struct nf_conntrack_expect *exp, u32 portid, int report) {} static inline int nf_conntrack_ecache_pernet_init(struct net *net) { return 0; } static inline void nf_conntrack_ecache_pernet_fini(struct net *net) { } static inline int nf_conntrack_ecache_init(void) { return 0; } static inline void nf_conntrack_ecache_fini(void) { } static inline void nf_conntrack_ecache_delayed_work(struct net *net) { } static inline void nf_conntrack_ecache_work(struct net *net) { } #endif /* CONFIG_NF_CONNTRACK_EVENTS */ #endif /*_NF_CONNTRACK_ECACHE_H*/ |
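/*
 * Consumer sketch (illustrative module-style code, not part of the header
 * above): wiring a callback into the notifier interface declared there.
 * In mainline only ctnetlink registers a notifier and there can be just
 * one per netns, so registration would return -EBUSY alongside it; the
 * pr_info() calls are placeholders for real event handling.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <net/net_namespace.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_ecache.h>

static int demo_ct_event(unsigned int events, struct nf_ct_event *item)
{
	if (events & (1 << IPCT_NEW))
		pr_info("new conntrack %p (portid %u)\n", item->ct, item->portid);
	if (events & (1 << IPCT_DESTROY))
		pr_info("conntrack %p destroyed\n", item->ct);
	return 0;	/* 0 on success; a negative value marks the event as missed */
}

static struct nf_ct_event_notifier demo_notifier = {
	.fcn = demo_ct_event,
};

static int __init demo_init(void)
{
	return nf_conntrack_register_notifier(&init_net, &demo_notifier);
}

static void __exit demo_exit(void)
{
	nf_conntrack_unregister_notifier(&init_net, &demo_notifier);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");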
/* Copyright (C) 2013 Cisco Systems, Inc, 2013.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * Author: Vijay Subramanian <vijaynsu@cisco.com>
 * Author: Mythili Prabhu <mysuryan@cisco.com>
 *
 * ECN support is added by Naeem Khademi <naeemk@ifi.uio.no>
 * University of Oslo, Norway.
* * References: * IETF draft submission: http://tools.ietf.org/html/draft-pan-aqm-pie-00 * IEEE Conference on High Performance Switching and Routing 2013 : * "PIE: A * Lightweight Control Scheme to Address the Bufferbloat Problem" */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> #define QUEUE_THRESHOLD 10000 #define DQCOUNT_INVALID -1 #define MAX_PROB 0xffffffff #define PIE_SCALE 8 /* parameters used */ struct pie_params { psched_time_t target; /* user specified target delay in pschedtime */ u32 tupdate; /* timer frequency (in jiffies) */ u32 limit; /* number of packets that can be enqueued */ u32 alpha; /* alpha and beta are between 0 and 32 */ u32 beta; /* and are used for shift relative to 1 */ bool ecn; /* true if ecn is enabled */ bool bytemode; /* to scale drop early prob based on pkt size */ }; /* variables used */ struct pie_vars { u32 prob; /* probability but scaled by u32 limit. */ psched_time_t burst_time; psched_time_t qdelay; psched_time_t qdelay_old; u64 dq_count; /* measured in bytes */ psched_time_t dq_tstamp; /* drain rate */ u32 avg_dq_rate; /* bytes per pschedtime tick,scaled */ u32 qlen_old; /* in bytes */ }; /* statistics gathering */ struct pie_stats { u32 packets_in; /* total number of packets enqueued */ u32 dropped; /* packets dropped due to pie_action */ u32 overlimit; /* dropped due to lack of space in queue */ u32 maxq; /* maximum queue size */ u32 ecn_mark; /* packets marked with ECN */ }; /* private data for the Qdisc */ struct pie_sched_data { struct pie_params params; struct pie_vars vars; struct pie_stats stats; struct timer_list adapt_timer; struct Qdisc *sch; }; static void pie_params_init(struct pie_params *params) { params->alpha = 2; params->beta = 20; params->tupdate = usecs_to_jiffies(30 * USEC_PER_MSEC); /* 30 ms */ params->limit = 1000; /* default of 1000 packets */ params->target = PSCHED_NS2TICKS(20 * NSEC_PER_MSEC); /* 20 ms */ params->ecn = false; params->bytemode = false; } static void pie_vars_init(struct pie_vars *vars) { vars->dq_count = DQCOUNT_INVALID; vars->avg_dq_rate = 0; /* default of 100 ms in pschedtime */ vars->burst_time = PSCHED_NS2TICKS(100 * NSEC_PER_MSEC); } static bool drop_early(struct Qdisc *sch, u32 packet_size) { struct pie_sched_data *q = qdisc_priv(sch); u32 rnd; u32 local_prob = q->vars.prob; u32 mtu = psched_mtu(qdisc_dev(sch)); /* If there is still burst allowance left skip random early drop */ if (q->vars.burst_time > 0) return false; /* If current delay is less than half of target, and * if drop prob is low already, disable early_drop */ if ((q->vars.qdelay < q->params.target / 2) && (q->vars.prob < MAX_PROB / 5)) return false; /* If we have fewer than 2 mtu-sized packets, disable drop_early, * similar to min_th in RED */ if (sch->qstats.backlog < 2 * mtu) return false; /* If bytemode is turned on, use packet size to compute new * probablity. 
Smaller packets will have lower drop prob in this case */ if (q->params.bytemode && packet_size <= mtu) local_prob = (local_prob / mtu) * packet_size; else local_prob = q->vars.prob; rnd = prandom_u32(); if (rnd < local_prob) return true; return false; } static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct pie_sched_data *q = qdisc_priv(sch); bool enqueue = false; if (unlikely(qdisc_qlen(sch) >= sch->limit)) { q->stats.overlimit++; goto out; } if (!drop_early(sch, skb->len)) { enqueue = true; } else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) && INET_ECN_set_ce(skb)) { /* If packet is ecn capable, mark it if drop probability * is lower than 10%, else drop it. */ q->stats.ecn_mark++; enqueue = true; } /* we can enqueue the packet */ if (enqueue) { q->stats.packets_in++; if (qdisc_qlen(sch) > q->stats.maxq) q->stats.maxq = qdisc_qlen(sch); return qdisc_enqueue_tail(skb, sch); } out: q->stats.dropped++; return qdisc_drop(skb, sch, to_free); } static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { [TCA_PIE_TARGET] = {.type = NLA_U32}, [TCA_PIE_LIMIT] = {.type = NLA_U32}, [TCA_PIE_TUPDATE] = {.type = NLA_U32}, [TCA_PIE_ALPHA] = {.type = NLA_U32}, [TCA_PIE_BETA] = {.type = NLA_U32}, [TCA_PIE_ECN] = {.type = NLA_U32}, [TCA_PIE_BYTEMODE] = {.type = NLA_U32}, }; static int pie_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct pie_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_PIE_MAX + 1]; unsigned int qlen, dropped = 0; int err; if (!opt) return -EINVAL; err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy, NULL); if (err < 0) return err; sch_tree_lock(sch); /* convert from microseconds to pschedtime */ if (tb[TCA_PIE_TARGET]) { /* target is in us */ u32 target = nla_get_u32(tb[TCA_PIE_TARGET]); /* convert to pschedtime */ q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC); } /* tupdate is in jiffies */ if (tb[TCA_PIE_TUPDATE]) q->params.tupdate = usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE])); if (tb[TCA_PIE_LIMIT]) { u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]); q->params.limit = limit; sch->limit = limit; } if (tb[TCA_PIE_ALPHA]) q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]); if (tb[TCA_PIE_BETA]) q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]); if (tb[TCA_PIE_ECN]) q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]); if (tb[TCA_PIE_BYTEMODE]) q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]); /* Drop excess packets if new limit is lower */ qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); rtnl_qdisc_drop(skb, sch); } qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); sch_tree_unlock(sch); return 0; } static void pie_process_dequeue(struct Qdisc *sch, struct sk_buff *skb) { struct pie_sched_data *q = qdisc_priv(sch); int qlen = sch->qstats.backlog; /* current queue size in bytes */ /* If current queue is about 10 packets or more and dq_count is unset * we have enough packets to calculate the drain rate. Save * current time as dq_tstamp and start measurement cycle. */ if (qlen >= QUEUE_THRESHOLD && q->vars.dq_count == DQCOUNT_INVALID) { q->vars.dq_tstamp = psched_get_time(); q->vars.dq_count = 0; } /* Calculate the average drain rate from this value. 
If queue length * has receded to a small value viz., <= QUEUE_THRESHOLD bytes,reset * the dq_count to -1 as we don't have enough packets to calculate the * drain rate anymore The following if block is entered only when we * have a substantial queue built up (QUEUE_THRESHOLD bytes or more) * and we calculate the drain rate for the threshold here. dq_count is * in bytes, time difference in psched_time, hence rate is in * bytes/psched_time. */ if (q->vars.dq_count != DQCOUNT_INVALID) { q->vars.dq_count += skb->len; if (q->vars.dq_count >= QUEUE_THRESHOLD) { psched_time_t now = psched_get_time(); u32 dtime = now - q->vars.dq_tstamp; u32 count = q->vars.dq_count << PIE_SCALE; if (dtime == 0) return; count = count / dtime; if (q->vars.avg_dq_rate == 0) q->vars.avg_dq_rate = count; else q->vars.avg_dq_rate = (q->vars.avg_dq_rate - (q->vars.avg_dq_rate >> 3)) + (count >> 3); /* If the queue has receded below the threshold, we hold * on to the last drain rate calculated, else we reset * dq_count to 0 to re-enter the if block when the next * packet is dequeued */ if (qlen < QUEUE_THRESHOLD) q->vars.dq_count = DQCOUNT_INVALID; else { q->vars.dq_count = 0; q->vars.dq_tstamp = psched_get_time(); } if (q->vars.burst_time > 0) { if (q->vars.burst_time > dtime) q->vars.burst_time -= dtime; else q->vars.burst_time = 0; } } } } static void calculate_probability(struct Qdisc *sch) { struct pie_sched_data *q = qdisc_priv(sch); u32 qlen = sch->qstats.backlog; /* queue size in bytes */ psched_time_t qdelay = 0; /* in pschedtime */ psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */ s32 delta = 0; /* determines the change in probability */ u32 oldprob; u32 alpha, beta; bool update_prob = true; q->vars.qdelay_old = q->vars.qdelay; if (q->vars.avg_dq_rate > 0) qdelay = (qlen << PIE_SCALE) / q->vars.avg_dq_rate; else qdelay = 0; /* If qdelay is zero and qlen is not, it means qlen is very small, less * than dequeue_rate, so we do not update probabilty in this round */ if (qdelay == 0 && qlen != 0) update_prob = false; /* In the algorithm, alpha and beta are between 0 and 2 with typical * value for alpha as 0.125. In this implementation, we use values 0-32 * passed from user space to represent this. Also, alpha and beta have * unit of HZ and need to be scaled before they can used to update * probability. alpha/beta are updated locally below by 1) scaling them * appropriately 2) scaling down by 16 to come to 0-2 range. * Please see paper for details. * * We scale alpha and beta differently depending on whether we are in * light, medium or high dropping mode. 
*/ if (q->vars.prob < MAX_PROB / 100) { alpha = (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; beta = (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; } else if (q->vars.prob < MAX_PROB / 10) { alpha = (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; beta = (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; } else { alpha = (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; beta = (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; } /* alpha and beta should be between 0 and 32, in multiples of 1/16 */ delta += alpha * ((qdelay - q->params.target)); delta += beta * ((qdelay - qdelay_old)); oldprob = q->vars.prob; /* to ensure we increase probability in steps of no more than 2% */ if (delta > (s32) (MAX_PROB / (100 / 2)) && q->vars.prob >= MAX_PROB / 10) delta = (MAX_PROB / 100) * 2; /* Non-linear drop: * Tune drop probability to increase quickly for high delays(>= 250ms) * 250ms is derived through experiments and provides error protection */ if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC))) delta += MAX_PROB / (100 / 2); q->vars.prob += delta; if (delta > 0) { /* prevent overflow */ if (q->vars.prob < oldprob) { q->vars.prob = MAX_PROB; /* Prevent normalization error. If probability is at * maximum value already, we normalize it here, and * skip the check to do a non-linear drop in the next * section. */ update_prob = false; } } else { /* prevent underflow */ if (q->vars.prob > oldprob) q->vars.prob = 0; } /* Non-linear drop in probability: Reduce drop probability quickly if * delay is 0 for 2 consecutive Tupdate periods. */ if ((qdelay == 0) && (qdelay_old == 0) && update_prob) q->vars.prob = (q->vars.prob * 98) / 100; q->vars.qdelay = qdelay; q->vars.qlen_old = qlen; /* We restart the measurement cycle if the following conditions are met * 1. If the delay has been low for 2 consecutive Tupdate periods * 2. Calculated drop probability is zero * 3. We have atleast one estimate for the avg_dq_rate ie., * is a non-zero value */ if ((q->vars.qdelay < q->params.target / 2) && (q->vars.qdelay_old < q->params.target / 2) && (q->vars.prob == 0) && (q->vars.avg_dq_rate > 0)) pie_vars_init(&q->vars); } static void pie_timer(struct timer_list *t) { struct pie_sched_data *q = from_timer(q, t, adapt_timer); struct Qdisc *sch = q->sch; spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); calculate_probability(sch); /* reset the timer to fire after 'tupdate'. tupdate is in jiffies. 
*/ if (q->params.tupdate) mod_timer(&q->adapt_timer, jiffies + q->params.tupdate); spin_unlock(root_lock); } static int pie_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct pie_sched_data *q = qdisc_priv(sch); pie_params_init(&q->params); pie_vars_init(&q->vars); sch->limit = q->params.limit; q->sch = sch; timer_setup(&q->adapt_timer, pie_timer, 0); if (opt) { int err = pie_change(sch, opt, extack); if (err) return err; } mod_timer(&q->adapt_timer, jiffies + HZ / 2); return 0; } static int pie_dump(struct Qdisc *sch, struct sk_buff *skb) { struct pie_sched_data *q = qdisc_priv(sch); struct nlattr *opts; opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; /* convert target from pschedtime to us */ if (nla_put_u32(skb, TCA_PIE_TARGET, ((u32) PSCHED_TICKS2NS(q->params.target)) / NSEC_PER_USEC) || nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) || nla_put_u32(skb, TCA_PIE_TUPDATE, jiffies_to_usecs(q->params.tupdate)) || nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) || nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) || nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) || nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode)) goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: nla_nest_cancel(skb, opts); return -1; } static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct pie_sched_data *q = qdisc_priv(sch); struct tc_pie_xstats st = { .prob = q->vars.prob, .delay = ((u32) PSCHED_TICKS2NS(q->vars.qdelay)) / NSEC_PER_USEC, /* unscale and return dq_rate in bytes per sec */ .avg_dq_rate = q->vars.avg_dq_rate * (PSCHED_TICKS_PER_SEC) >> PIE_SCALE, .packets_in = q->stats.packets_in, .overlimit = q->stats.overlimit, .maxq = q->stats.maxq, .dropped = q->stats.dropped, .ecn_mark = q->stats.ecn_mark, }; return gnet_stats_copy_app(d, &st, sizeof(st)); } static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch) { struct sk_buff *skb; skb = qdisc_dequeue_head(sch); if (!skb) return NULL; pie_process_dequeue(sch, skb); return skb; } static void pie_reset(struct Qdisc *sch) { struct pie_sched_data *q = qdisc_priv(sch); qdisc_reset_queue(sch); pie_vars_init(&q->vars); } static void pie_destroy(struct Qdisc *sch) { struct pie_sched_data *q = qdisc_priv(sch); q->params.tupdate = 0; del_timer_sync(&q->adapt_timer); } static struct Qdisc_ops pie_qdisc_ops __read_mostly = { .id = "pie", .priv_size = sizeof(struct pie_sched_data), .enqueue = pie_qdisc_enqueue, .dequeue = pie_qdisc_dequeue, .peek = qdisc_peek_dequeued, .init = pie_init, .destroy = pie_destroy, .reset = pie_reset, .change = pie_change, .dump = pie_dump, .dump_stats = pie_dump_stats, .owner = THIS_MODULE, }; static int __init pie_module_init(void) { return register_qdisc(&pie_qdisc_ops); } static void __exit pie_module_exit(void) { unregister_qdisc(&pie_qdisc_ops); } module_init(pie_module_init); module_exit(pie_module_exit); MODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler"); MODULE_AUTHOR("Vijay Subramanian"); MODULE_AUTHOR("Mythili Prabhu"); MODULE_LICENSE("GPL"); |
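/*
 * Illustrative, floating-point restatement (user space, not kernel code) of
 * the control law that calculate_probability() above implements in fixed
 * point.  It omits the MAX_PROB scaling and the light/medium/high dropping
 * mode adjustment; alpha and beta are the kernel defaults (2 and 20, in
 * units of 1/16) descaled to 0.125 and 1.25, the target is the default 20 ms.
 */
#include <stdio.h>

int main(void)
{
	double target = 0.020;			/* 20 ms target delay */
	double alpha = 0.125, beta = 1.25;	/* descaled kernel defaults */
	double prob = 0.0, qdelay_old = 0.0;
	/* pretend the estimated queueing delay ramps up over a few Tupdates */
	double qdelay_samples[] = { 0.010, 0.025, 0.040, 0.035, 0.020 };

	for (int i = 0; i < 5; i++) {
		double qdelay = qdelay_samples[i];
		double delta = alpha * (qdelay - target) +
			       beta * (qdelay - qdelay_old);

		prob += delta;
		if (prob < 0.0)		/* prevent underflow */
			prob = 0.0;
		if (prob > 1.0)		/* prevent overflow */
			prob = 1.0;
		qdelay_old = qdelay;
		printf("qdelay=%.0f ms -> drop prob %.4f\n", qdelay * 1e3, prob);
	}
	return 0;
}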
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar * * This file contains the /proc/irq/ handling code. */ #include <linux/irq.h> #include <linux/gfp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/mutex.h> #include "internals.h" /* * Access rules: * * procfs protects read/write of /proc/irq/N/ files against a * concurrent free of the interrupt descriptor. remove_proc_entry() * immediately prevents new read/writes to happen and waits for * already running read/write functions to complete. * * We remove the proc entries first and then delete the interrupt * descriptor from the radix tree and free it. So it is guaranteed * that irq_to_desc(N) is valid as long as the read/writes are * permitted by procfs. * * The read from /proc/interrupts is a different problem because there * is no protection. So the lookup and the access to irqdesc * information must be protected by sparse_irq_lock. 
*/ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP enum { AFFINITY, AFFINITY_LIST, EFFECTIVE, EFFECTIVE_LIST, }; static int show_irq_affinity(int type, struct seq_file *m) { struct irq_desc *desc = irq_to_desc((long)m->private); const struct cpumask *mask; switch (type) { case AFFINITY: case AFFINITY_LIST: mask = desc->irq_common_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ if (irqd_is_setaffinity_pending(&desc->irq_data)) mask = desc->pending_mask; #endif break; case EFFECTIVE: case EFFECTIVE_LIST: #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK mask = irq_data_get_effective_affinity_mask(&desc->irq_data); break; #endif default: return -EINVAL; } switch (type) { case AFFINITY_LIST: case EFFECTIVE_LIST: seq_printf(m, "%*pbl\n", cpumask_pr_args(mask)); break; case AFFINITY: case EFFECTIVE: seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); break; } return 0; } static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); unsigned long flags; cpumask_var_t mask; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; raw_spin_lock_irqsave(&desc->lock, flags); if (desc->affinity_hint) cpumask_copy(mask, desc->affinity_hint); raw_spin_unlock_irqrestore(&desc->lock, flags); seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); free_cpumask_var(mask); return 0; } #ifndef is_affinity_mask_valid #define is_affinity_mask_valid(val) 1 #endif int no_irq_affinity; static int irq_affinity_proc_show(struct seq_file *m, void *v) { return show_irq_affinity(AFFINITY, m); } static int irq_affinity_list_proc_show(struct seq_file *m, void *v) { return show_irq_affinity(AFFINITY_LIST, m); } #ifndef CONFIG_AUTO_IRQ_AFFINITY static inline int irq_select_affinity_usr(unsigned int irq) { /* * If the interrupt is started up already then this fails. The * interrupt is assigned to an online CPU already. There is no * point to move it around randomly. Tell user space that the * selected mask is bogus. * * If not then any change to the affinity is pointless because the * startup code invokes irq_setup_affinity() which will select * a online CPU anyway. */ return -EINVAL; } #else /* ALPHA magic affinity auto selector. Keep it for historical reasons. */ static inline int irq_select_affinity_usr(unsigned int irq) { return irq_select_affinity(irq); } #endif static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) { unsigned int irq = (int)(long)PDE_DATA(file_inode(file)); cpumask_var_t new_value; int err; if (!irq_can_set_affinity_usr(irq) || no_irq_affinity) return -EIO; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; if (type) err = cpumask_parselist_user(buffer, count, new_value); else err = cpumask_parse_user(buffer, count, new_value); if (err) goto free_cpumask; if (!is_affinity_mask_valid(new_value)) { err = -EINVAL; goto free_cpumask; } /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. */ if (!cpumask_intersects(new_value, cpu_online_mask)) { /* * Special case for empty set - allow the architecture code * to set default SMP affinity. */ err = irq_select_affinity_usr(irq) ? 
-EINVAL : count; } else { err = irq_set_affinity(irq, new_value); if (!err) err = count; } free_cpumask: free_cpumask_var(new_value); return err; } static ssize_t irq_affinity_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { return write_irq_affinity(0, file, buffer, count, pos); } static ssize_t irq_affinity_list_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { return write_irq_affinity(1, file, buffer, count, pos); } static int irq_affinity_proc_open(struct inode *inode, struct file *file) { return single_open(file, irq_affinity_proc_show, PDE_DATA(inode)); } static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) { return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode)); } static const struct file_operations irq_affinity_proc_fops = { .open = irq_affinity_proc_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, .write = irq_affinity_proc_write, }; static const struct file_operations irq_affinity_list_proc_fops = { .open = irq_affinity_list_proc_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, .write = irq_affinity_list_proc_write, }; #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK static int irq_effective_aff_proc_show(struct seq_file *m, void *v) { return show_irq_affinity(EFFECTIVE, m); } static int irq_effective_aff_list_proc_show(struct seq_file *m, void *v) { return show_irq_affinity(EFFECTIVE_LIST, m); } #endif static int default_affinity_show(struct seq_file *m, void *v) { seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity)); return 0; } static ssize_t default_affinity_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { cpumask_var_t new_value; int err; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; err = cpumask_parse_user(buffer, count, new_value); if (err) goto out; if (!is_affinity_mask_valid(new_value)) { err = -EINVAL; goto out; } /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. 
*/ if (!cpumask_intersects(new_value, cpu_online_mask)) { err = -EINVAL; goto out; } cpumask_copy(irq_default_affinity, new_value); err = count; out: free_cpumask_var(new_value); return err; } static int default_affinity_open(struct inode *inode, struct file *file) { return single_open(file, default_affinity_show, PDE_DATA(inode)); } static const struct file_operations default_affinity_proc_fops = { .open = default_affinity_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, .write = default_affinity_write, }; static int irq_node_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long) m->private); seq_printf(m, "%d\n", irq_desc_get_node(desc)); return 0; } #endif static int irq_spurious_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long) m->private); seq_printf(m, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n", desc->irq_count, desc->irqs_unhandled, jiffies_to_msecs(desc->last_unhandled)); return 0; } #define MAX_NAMELEN 128 static int name_unique(unsigned int irq, struct irqaction *new_action) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; unsigned long flags; int ret = 1; raw_spin_lock_irqsave(&desc->lock, flags); for_each_action_of_desc(desc, action) { if ((action != new_action) && action->name && !strcmp(new_action->name, action->name)) { ret = 0; break; } } raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } void register_handler_proc(unsigned int irq, struct irqaction *action) { char name [MAX_NAMELEN]; struct irq_desc *desc = irq_to_desc(irq); if (!desc->dir || action->dir || !action->name || !name_unique(irq, action)) return; snprintf(name, MAX_NAMELEN, "%s", action->name); /* create /proc/irq/1234/handler/ */ action->dir = proc_mkdir(name, desc->dir); } #undef MAX_NAMELEN #define MAX_NAMELEN 10 void register_irq_proc(unsigned int irq, struct irq_desc *desc) { static DEFINE_MUTEX(register_lock); void __maybe_unused *irqp = (void *)(unsigned long) irq; char name [MAX_NAMELEN]; if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip)) return; /* * irq directories are registered only when a handler is * added, not when the descriptor is created, so multiple * tasks might try to register at the same time. 
*/ mutex_lock(&register_lock); if (desc->dir) goto out_unlock; sprintf(name, "%d", irq); /* create /proc/irq/1234 */ desc->dir = proc_mkdir(name, root_irq_dir); if (!desc->dir) goto out_unlock; #ifdef CONFIG_SMP /* create /proc/irq/<irq>/smp_affinity */ proc_create_data("smp_affinity", 0644, desc->dir, &irq_affinity_proc_fops, irqp); /* create /proc/irq/<irq>/affinity_hint */ proc_create_single_data("affinity_hint", 0444, desc->dir, irq_affinity_hint_proc_show, irqp); /* create /proc/irq/<irq>/smp_affinity_list */ proc_create_data("smp_affinity_list", 0644, desc->dir, &irq_affinity_list_proc_fops, irqp); proc_create_single_data("node", 0444, desc->dir, irq_node_proc_show, irqp); # ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK proc_create_single_data("effective_affinity", 0444, desc->dir, irq_effective_aff_proc_show, irqp); proc_create_single_data("effective_affinity_list", 0444, desc->dir, irq_effective_aff_list_proc_show, irqp); # endif #endif proc_create_single_data("spurious", 0444, desc->dir, irq_spurious_proc_show, (void *)(long)irq); out_unlock: mutex_unlock(&register_lock); } void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { char name [MAX_NAMELEN]; if (!root_irq_dir || !desc->dir) return; #ifdef CONFIG_SMP remove_proc_entry("smp_affinity", desc->dir); remove_proc_entry("affinity_hint", desc->dir); remove_proc_entry("smp_affinity_list", desc->dir); remove_proc_entry("node", desc->dir); # ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK remove_proc_entry("effective_affinity", desc->dir); remove_proc_entry("effective_affinity_list", desc->dir); # endif #endif remove_proc_entry("spurious", desc->dir); sprintf(name, "%u", irq); remove_proc_entry(name, root_irq_dir); } #undef MAX_NAMELEN void unregister_handler_proc(unsigned int irq, struct irqaction *action) { proc_remove(action->dir); } static void register_default_affinity_proc(void) { #ifdef CONFIG_SMP proc_create("irq/default_smp_affinity", 0644, NULL, &default_affinity_proc_fops); #endif } void init_irq_proc(void) { unsigned int irq; struct irq_desc *desc; /* create /proc/irq */ root_irq_dir = proc_mkdir("irq", NULL); if (!root_irq_dir) return; register_default_affinity_proc(); /* * Create entries for all existing IRQs. */ for_each_irq_desc(irq, desc) register_irq_proc(irq, desc); } #ifdef CONFIG_GENERIC_IRQ_SHOW int __weak arch_show_interrupts(struct seq_file *p, int prec) { return 0; } #ifndef ACTUAL_NR_IRQS # define ACTUAL_NR_IRQS nr_irqs #endif int show_interrupts(struct seq_file *p, void *v) { static int prec; unsigned long flags, any_count = 0; int i = *(loff_t *) v, j; struct irqaction *action; struct irq_desc *desc; if (i > ACTUAL_NR_IRQS) return 0; if (i == ACTUAL_NR_IRQS) return arch_show_interrupts(p, prec); /* print header and calculate the width of the first column */ if (i == 0) { for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) j *= 10; seq_printf(p, "%*s", prec + 8, ""); for_each_online_cpu(j) seq_printf(p, "CPU%-8d", j); seq_putc(p, '\n'); } rcu_read_lock(); desc = irq_to_desc(i); if (!desc) goto outsparse; if (desc->kstat_irqs) for_each_online_cpu(j) any_count |= *per_cpu_ptr(desc->kstat_irqs, j); if ((!desc->action || irq_desc_is_chained(desc)) && !any_count) goto outsparse; seq_printf(p, "%*d: ", prec, i); for_each_online_cpu(j) seq_printf(p, "%10u ", desc->kstat_irqs ? 
*per_cpu_ptr(desc->kstat_irqs, j) : 0); raw_spin_lock_irqsave(&desc->lock, flags); if (desc->irq_data.chip) { if (desc->irq_data.chip->irq_print_chip) desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); else if (desc->irq_data.chip->name) seq_printf(p, " %8s", desc->irq_data.chip->name); else seq_printf(p, " %8s", "-"); } else { seq_printf(p, " %8s", "None"); } if (desc->irq_data.domain) seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq); else seq_printf(p, " %*s", prec, ""); #ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); #endif if (desc->name) seq_printf(p, "-%-8s", desc->name); action = desc->action; if (action) { seq_printf(p, " %s", action->name); while ((action = action->next) != NULL) seq_printf(p, ", %s", action->name); } seq_putc(p, '\n'); raw_spin_unlock_irqrestore(&desc->lock, flags); outsparse: rcu_read_unlock(); return 0; } #endif |
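/*
 * Illustrative user-space sketch (not part of kernel/irq/proc.c): pinning an
 * interrupt through the /proc/irq/<N>/smp_affinity file implemented above.
 * smp_affinity takes a hexadecimal CPU mask, smp_affinity_list takes a list
 * such as "0-1"; the kernel side rejects masks that do not intersect the
 * online CPUs and returns -EIO for IRQs whose affinity cannot be changed.
 * The IRQ number 42 and the mask "3" below are made up for the example.
 */
#include <errno.h>
#include <stdio.h>

static int set_irq_affinity(unsigned int irq, const char *hexmask)
{
	char path[64];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/irq/%u/smp_affinity", irq);
	f = fopen(path, "w");
	if (!f)
		return -errno;

	if (fputs(hexmask, f) == EOF) {
		fclose(f);
		return -EIO;
	}
	if (fclose(f) != 0)		/* the kernel's error shows up here */
		return -errno;

	return 0;
}

int main(void)
{
	int err = set_irq_affinity(42, "3");	/* CPUs 0 and 1 */

	if (err)
		fprintf(stderr, "failed to set affinity: %d\n", err);
	return err ? 1 : 0;
}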
/** * \file drm_dma.c * DMA IOCTL and function support * * \author Rickard E. (Rik) Faith <faith@valinux.com> * \author Gareth Hughes <gareth@valinux.com> */ /* * Created: Fri Mar 19 14:30:16 1999 by faith@valinux.com * * Copyright 1999, 2000 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include <linux/export.h> #include <drm/drmP.h> #include "drm_legacy.h" /** * Initialize the DMA data. * * \param dev DRM device. * \return zero on success or a negative value on failure. * * Allocate and initialize a drm_device_dma structure. */ int drm_legacy_dma_setup(struct drm_device *dev) { int i; if (!drm_core_check_feature(dev, DRIVER_HAVE_DMA) || !drm_core_check_feature(dev, DRIVER_LEGACY)) return 0; dev->buf_use = 0; atomic_set(&dev->buf_alloc, 0); dev->dma = kzalloc(sizeof(*dev->dma), GFP_KERNEL); if (!dev->dma) return -ENOMEM; for (i = 0; i <= DRM_MAX_ORDER; i++) memset(&dev->dma->bufs[i], 0, sizeof(dev->dma->bufs[0])); return 0; } /** * Cleanup the DMA resources. * * \param dev DRM device. * * Free all pages associated with DMA buffers, the buffers and pages lists, and * finally the drm_device::dma structure itself. 
*/ void drm_legacy_dma_takedown(struct drm_device *dev) { struct drm_device_dma *dma = dev->dma; int i, j; if (!drm_core_check_feature(dev, DRIVER_HAVE_DMA) || !drm_core_check_feature(dev, DRIVER_LEGACY)) return; if (!dma) return; /* Clear dma buffers */ for (i = 0; i <= DRM_MAX_ORDER; i++) { if (dma->bufs[i].seg_count) { DRM_DEBUG("order %d: buf_count = %d," " seg_count = %d\n", i, dma->bufs[i].buf_count, dma->bufs[i].seg_count); for (j = 0; j < dma->bufs[i].seg_count; j++) { if (dma->bufs[i].seglist[j]) { drm_pci_free(dev, dma->bufs[i].seglist[j]); } } kfree(dma->bufs[i].seglist); } if (dma->bufs[i].buf_count) { for (j = 0; j < dma->bufs[i].buf_count; j++) { kfree(dma->bufs[i].buflist[j].dev_private); } kfree(dma->bufs[i].buflist); } } kfree(dma->buflist); kfree(dma->pagelist); kfree(dev->dma); dev->dma = NULL; } /** * Free a buffer. * * \param dev DRM device. * \param buf buffer to free. * * Resets the fields of \p buf. */ void drm_legacy_free_buffer(struct drm_device *dev, struct drm_buf * buf) { if (!buf) return; buf->waiting = 0; buf->pending = 0; buf->file_priv = NULL; buf->used = 0; } /** * Reclaim the buffers. * * \param file_priv DRM file private. * * Frees each buffer associated with \p file_priv not already on the hardware. */ void drm_legacy_reclaim_buffers(struct drm_device *dev, struct drm_file *file_priv) { struct drm_device_dma *dma = dev->dma; int i; if (!dma) return; for (i = 0; i < dma->buf_count; i++) { if (dma->buflist[i]->file_priv == file_priv) { switch (dma->buflist[i]->list) { case DRM_LIST_NONE: drm_legacy_free_buffer(dev, dma->buflist[i]); break; case DRM_LIST_WAIT: dma->buflist[i]->list = DRM_LIST_RECLAIM; break; default: /* Buffer already on hardware. */ break; } } } } |
// SPDX-License-Identifier: GPL-2.0 /* * preemptoff and irqoff tracepoints * * Copyright (C) Joel Fernandes (Google) <joel@joelfernandes.org> */ #include <linux/kallsyms.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ftrace.h> #include "trace.h" #define CREATE_TRACE_POINTS #include <trace/events/preemptirq.h> #ifdef CONFIG_TRACE_IRQFLAGS /* Per-cpu variable to prevent redundant calls when IRQs already off */ static DEFINE_PER_CPU(int, tracing_irq_cpu); void trace_hardirqs_on(void) { if (this_cpu_read(tracing_irq_cpu)) { if (!in_nmi()) trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on); void trace_hardirqs_off(void) { if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); if (!in_nmi()) trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); } lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off); __visible void trace_hardirqs_on_caller(unsigned long caller_addr) { if (this_cpu_read(tracing_irq_cpu)) { if (!in_nmi()) trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); tracer_hardirqs_on(CALLER_ADDR0, caller_addr); this_cpu_write(tracing_irq_cpu, 0); } lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { lockdep_hardirqs_off(CALLER_ADDR0); if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, caller_addr); if (!in_nmi()) trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); } } EXPORT_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_TRACE_IRQFLAGS */ #ifdef CONFIG_TRACE_PREEMPT_TOGGLE void trace_preempt_on(unsigned long a0, unsigned long a1) { if (!in_nmi()) trace_preempt_enable_rcuidle(a0, a1); tracer_preempt_on(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { if (!in_nmi()) trace_preempt_disable_rcuidle(a0, a1); tracer_preempt_off(a0, a1); } #endif
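/*
 * Illustrative sketch (not part of this file): the per-CPU tracing_irq_cpu
 * flag above makes the events edge-triggered - only the transition from
 * "IRQs on" to "IRQs off" (and the matching re-enable) is reported, no
 * matter how many redundant calls happen in between.  The user-space model
 * below uses a thread-local flag in place of the per-CPU variable and
 * printf() in place of the tracepoints; all names are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

static _Thread_local bool traced_off;

static void model_hardirqs_off(void)
{
	if (!traced_off) {		/* outermost disable only */
		traced_off = true;
		printf("event: irq_disable\n");
	}
}

static void model_hardirqs_on(void)
{
	if (traced_off) {		/* matching outermost enable only */
		printf("event: irq_enable\n");
		traced_off = false;
	}
}

int main(void)
{
	model_hardirqs_off();	/* emits irq_disable */
	model_hardirqs_off();	/* already off - no event */
	model_hardirqs_on();	/* emits irq_enable */
	model_hardirqs_on();	/* already on - no event */
	return 0;
}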
/************************************************************************** * * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA. * Copyright 2016 Intel Corporation * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sub license, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice (including the * next paragraph) shall be included in all copies or substantial portions * of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. * * **************************************************************************/ /* * Generic simple memory manager implementation. Intended to be used as a base * class implementation for more advanced memory managers. * * Note that the algorithm used is quite simple and there might be substantial * performance gains if a smarter free list is implemented. Currently it is * just an unordered stack of free regions. This could easily be improved if * an RB-tree is used instead. At least if we expect heavy fragmentation. * * Aligned allocations can also see improvement. * * Authors: * Thomas Hellström <thomas-at-tungstengraphics-dot-com> */ #include <drm/drmP.h> #include <drm/drm_mm.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <linux/export.h> #include <linux/interval_tree_generic.h> /** * DOC: Overview * * drm_mm provides a simple range allocator. The drivers are free to use the * resource allocator from the linux core if it suits them, the upside of drm_mm * is that it's in the DRM core. Which means that it's easier to extend for * some of the crazier special purpose needs of gpus. * * The main data struct is &drm_mm, allocations are tracked in &drm_mm_node. * Drivers are free to embed either of them into their own suitable * datastructures. drm_mm itself will not do any memory allocations of its own, * so if drivers choose not to embed nodes they need to still allocate them * themselves. * * The range allocator also supports reservation of preallocated blocks. This is * useful for taking over initial mode setting configurations from the firmware, * where an object needs to be created which exactly matches the firmware's * scanout target. 
As long as the range is still free it can be inserted anytime * after the allocator is initialized, which helps with avoiding looped * dependencies in the driver load sequence. * * drm_mm maintains a stack of most recently freed holes, which of all * simplistic datastructures seems to be a fairly decent approach to clustering * allocations and avoiding too much fragmentation. This means free space * searches are O(num_holes). Given that all the fancy features drm_mm supports * something better would be fairly complex and since gfx thrashing is a fairly * steep cliff not a real concern. Removing a node again is O(1). * * drm_mm supports a few features: Alignment and range restrictions can be * supplied. Furthermore every &drm_mm_node has a color value (which is just an * opaque unsigned long) which in conjunction with a driver callback can be used * to implement sophisticated placement restrictions. The i915 DRM driver uses * this to implement guard pages between incompatible caching domains in the * graphics TT. * * Two behaviors are supported for searching and allocating: bottom-up and * top-down. The default is bottom-up. Top-down allocation can be used if the * memory area has different restrictions, or just to reduce fragmentation. * * Finally iteration helpers to walk all nodes and all holes are provided as are * some basic allocator dumpers for debugging. * * Note that this range allocator is not thread-safe, drivers need to protect * modifications with their own locking. The idea behind this is that for a full * memory manager additional data needs to be protected anyway, hence internal * locking would be fully redundant. */ #ifdef CONFIG_DRM_DEBUG_MM #include <linux/stackdepot.h> #define STACKDEPTH 32 #define BUFSZ 4096 static noinline void save_stack(struct drm_mm_node *node) { unsigned long entries[STACKDEPTH]; struct stack_trace trace = { .entries = entries, .max_entries = STACKDEPTH, .skip = 1 }; save_stack_trace(&trace); if (trace.nr_entries != 0 && trace.entries[trace.nr_entries-1] == ULONG_MAX) trace.nr_entries--; /* May be called under spinlock, so avoid sleeping */ node->stack = depot_save_stack(&trace, GFP_NOWAIT); } static void show_leaks(struct drm_mm *mm) { struct drm_mm_node *node; unsigned long entries[STACKDEPTH]; char *buf; buf = kmalloc(BUFSZ, GFP_KERNEL); if (!buf) return; list_for_each_entry(node, drm_mm_nodes(mm), node_list) { struct stack_trace trace = { .entries = entries, .max_entries = STACKDEPTH }; if (!node->stack) { DRM_ERROR("node [%08llx + %08llx]: unknown owner\n", node->start, node->size); continue; } depot_fetch_stack(node->stack, &trace); snprint_stack_trace(buf, BUFSZ, &trace, 0); DRM_ERROR("node [%08llx + %08llx]: inserted at\n%s", node->start, node->size, buf); } kfree(buf); } #undef STACKDEPTH #undef BUFSZ #else static void save_stack(struct drm_mm_node *node) { } static void show_leaks(struct drm_mm *mm) { } #endif #define START(node) ((node)->start) #define LAST(node) ((node)->start + (node)->size - 1) INTERVAL_TREE_DEFINE(struct drm_mm_node, rb, u64, __subtree_last, START, LAST, static inline, drm_mm_interval_tree) struct drm_mm_node * __drm_mm_interval_first(const struct drm_mm *mm, u64 start, u64 last) { return drm_mm_interval_tree_iter_first((struct rb_root_cached *)&mm->interval_tree, start, last) ?: (struct drm_mm_node *)&mm->head_node; } EXPORT_SYMBOL(__drm_mm_interval_first); static void drm_mm_interval_tree_add_node(struct drm_mm_node *hole_node, struct drm_mm_node *node) { struct drm_mm *mm = hole_node->mm; struct rb_node **link, 
*rb; struct drm_mm_node *parent; bool leftmost; node->__subtree_last = LAST(node); if (hole_node->allocated) { rb = &hole_node->rb; while (rb) { parent = rb_entry(rb, struct drm_mm_node, rb); if (parent->__subtree_last >= node->__subtree_last) break; parent->__subtree_last = node->__subtree_last; rb = rb_parent(rb); } rb = &hole_node->rb; link = &hole_node->rb.rb_right; leftmost = false; } else { rb = NULL; link = &mm->interval_tree.rb_root.rb_node; leftmost = true; } while (*link) { rb = *link; parent = rb_entry(rb, struct drm_mm_node, rb); if (parent->__subtree_last < node->__subtree_last) parent->__subtree_last = node->__subtree_last; if (node->start < parent->start) { link = &parent->rb.rb_left; } else { link = &parent->rb.rb_right; leftmost = false; } } rb_link_node(&node->rb, rb, link); rb_insert_augmented_cached(&node->rb, &mm->interval_tree, leftmost, &drm_mm_interval_tree_augment); } #define RB_INSERT(root, member, expr) do { \ struct rb_node **link = &root.rb_node, *rb = NULL; \ u64 x = expr(node); \ while (*link) { \ rb = *link; \ if (x < expr(rb_entry(rb, struct drm_mm_node, member))) \ link = &rb->rb_left; \ else \ link = &rb->rb_right; \ } \ rb_link_node(&node->member, rb, link); \ rb_insert_color(&node->member, &root); \ } while (0) #define HOLE_SIZE(NODE) ((NODE)->hole_size) #define HOLE_ADDR(NODE) (__drm_mm_hole_node_start(NODE)) static u64 rb_to_hole_size(struct rb_node *rb) { return rb_entry(rb, struct drm_mm_node, rb_hole_size)->hole_size; } static void insert_hole_size(struct rb_root_cached *root, struct drm_mm_node *node) { struct rb_node **link = &root->rb_root.rb_node, *rb = NULL; u64 x = node->hole_size; bool first = true; while (*link) { rb = *link; if (x > rb_to_hole_size(rb)) { link = &rb->rb_left; } else { link = &rb->rb_right; first = false; } } rb_link_node(&node->rb_hole_size, rb, link); rb_insert_color_cached(&node->rb_hole_size, root, first); } static void add_hole(struct drm_mm_node *node) { struct drm_mm *mm = node->mm; node->hole_size = __drm_mm_hole_node_end(node) - __drm_mm_hole_node_start(node); DRM_MM_BUG_ON(!drm_mm_hole_follows(node)); insert_hole_size(&mm->holes_size, node); RB_INSERT(mm->holes_addr, rb_hole_addr, HOLE_ADDR); list_add(&node->hole_stack, &mm->hole_stack); } static void rm_hole(struct drm_mm_node *node) { DRM_MM_BUG_ON(!drm_mm_hole_follows(node)); list_del(&node->hole_stack); rb_erase_cached(&node->rb_hole_size, &node->mm->holes_size); rb_erase(&node->rb_hole_addr, &node->mm->holes_addr); node->hole_size = 0; DRM_MM_BUG_ON(drm_mm_hole_follows(node)); } static inline struct drm_mm_node *rb_hole_size_to_node(struct rb_node *rb) { return rb_entry_safe(rb, struct drm_mm_node, rb_hole_size); } static inline struct drm_mm_node *rb_hole_addr_to_node(struct rb_node *rb) { return rb_entry_safe(rb, struct drm_mm_node, rb_hole_addr); } static inline u64 rb_hole_size(struct rb_node *rb) { return rb_entry(rb, struct drm_mm_node, rb_hole_size)->hole_size; } static struct drm_mm_node *best_hole(struct drm_mm *mm, u64 size) { struct rb_node *rb = mm->holes_size.rb_root.rb_node; struct drm_mm_node *best = NULL; do { struct drm_mm_node *node = rb_entry(rb, struct drm_mm_node, rb_hole_size); if (size <= node->hole_size) { best = node; rb = rb->rb_right; } else { rb = rb->rb_left; } } while (rb); return best; } static struct drm_mm_node *find_hole(struct drm_mm *mm, u64 addr) { struct rb_node *rb = mm->holes_addr.rb_node; struct drm_mm_node *node = NULL; while (rb) { u64 hole_start; node = rb_hole_addr_to_node(rb); hole_start = 
__drm_mm_hole_node_start(node); if (addr < hole_start) rb = node->rb_hole_addr.rb_left; else if (addr > hole_start + node->hole_size) rb = node->rb_hole_addr.rb_right; else break; } return node; } static struct drm_mm_node * first_hole(struct drm_mm *mm, u64 start, u64 end, u64 size, enum drm_mm_insert_mode mode) { switch (mode) { default: case DRM_MM_INSERT_BEST: return best_hole(mm, size); case DRM_MM_INSERT_LOW: return find_hole(mm, start); case DRM_MM_INSERT_HIGH: return find_hole(mm, end); case DRM_MM_INSERT_EVICT: return list_first_entry_or_null(&mm->hole_stack, struct drm_mm_node, hole_stack); } } static struct drm_mm_node * next_hole(struct drm_mm *mm, struct drm_mm_node *node, enum drm_mm_insert_mode mode) { switch (mode) { default: case DRM_MM_INSERT_BEST: return rb_hole_size_to_node(rb_prev(&node->rb_hole_size)); case DRM_MM_INSERT_LOW: return rb_hole_addr_to_node(rb_next(&node->rb_hole_addr)); case DRM_MM_INSERT_HIGH: return rb_hole_addr_to_node(rb_prev(&node->rb_hole_addr)); case DRM_MM_INSERT_EVICT: node = list_next_entry(node, hole_stack); return &node->hole_stack == &mm->hole_stack ? NULL : node; } } /** * drm_mm_reserve_node - insert an pre-initialized node * @mm: drm_mm allocator to insert @node into * @node: drm_mm_node to insert * * This functions inserts an already set-up &drm_mm_node into the allocator, * meaning that start, size and color must be set by the caller. All other * fields must be cleared to 0. This is useful to initialize the allocator with * preallocated objects which must be set-up before the range allocator can be * set-up, e.g. when taking over a firmware framebuffer. * * Returns: * 0 on success, -ENOSPC if there's no hole where @node is. */ int drm_mm_reserve_node(struct drm_mm *mm, struct drm_mm_node *node) { u64 end = node->start + node->size; struct drm_mm_node *hole; u64 hole_start, hole_end; u64 adj_start, adj_end; end = node->start + node->size; if (unlikely(end <= node->start)) return -ENOSPC; /* Find the relevant hole to add our node to */ hole = find_hole(mm, node->start); if (!hole) return -ENOSPC; adj_start = hole_start = __drm_mm_hole_node_start(hole); adj_end = hole_end = hole_start + hole->hole_size; if (mm->color_adjust) mm->color_adjust(hole, node->color, &adj_start, &adj_end); if (adj_start > node->start || adj_end < end) return -ENOSPC; node->mm = mm; list_add(&node->node_list, &hole->node_list); drm_mm_interval_tree_add_node(hole, node); node->allocated = true; node->hole_size = 0; rm_hole(hole); if (node->start > hole_start) add_hole(hole); if (end < hole_end) add_hole(node); save_stack(node); return 0; } EXPORT_SYMBOL(drm_mm_reserve_node); static u64 rb_to_hole_size_or_zero(struct rb_node *rb) { return rb ? rb_to_hole_size(rb) : 0; } /** * drm_mm_insert_node_in_range - ranged search for space and insert @node * @mm: drm_mm to allocate from * @node: preallocate node to insert * @size: size of the allocation * @alignment: alignment of the allocation * @color: opaque tag value to use for this node * @range_start: start of the allowed range for this node * @range_end: end of the allowed range for this node * @mode: fine-tune the allocation search and placement * * The preallocated @node must be cleared to 0. * * Returns: * 0 on success, -ENOSPC if there's no suitable hole. 
*/ int drm_mm_insert_node_in_range(struct drm_mm * const mm, struct drm_mm_node * const node, u64 size, u64 alignment, unsigned long color, u64 range_start, u64 range_end, enum drm_mm_insert_mode mode) { struct drm_mm_node *hole; u64 remainder_mask; bool once; DRM_MM_BUG_ON(range_start >= range_end); if (unlikely(size == 0 || range_end - range_start < size)) return -ENOSPC; if (rb_to_hole_size_or_zero(rb_first_cached(&mm->holes_size)) < size) return -ENOSPC; if (alignment <= 1) alignment = 0; once = mode & DRM_MM_INSERT_ONCE; mode &= ~DRM_MM_INSERT_ONCE; remainder_mask = is_power_of_2(alignment) ? alignment - 1 : 0; for (hole = first_hole(mm, range_start, range_end, size, mode); hole; hole = once ? NULL : next_hole(mm, hole, mode)) { u64 hole_start = __drm_mm_hole_node_start(hole); u64 hole_end = hole_start + hole->hole_size; u64 adj_start, adj_end; u64 col_start, col_end; if (mode == DRM_MM_INSERT_LOW && hole_start >= range_end) break; if (mode == DRM_MM_INSERT_HIGH && hole_end <= range_start) break; col_start = hole_start; col_end = hole_end; if (mm->color_adjust) mm->color_adjust(hole, color, &col_start, &col_end); adj_start = max(col_start, range_start); adj_end = min(col_end, range_end); if (adj_end <= adj_start || adj_end - adj_start < size) continue; if (mode == DRM_MM_INSERT_HIGH) adj_start = adj_end - size; if (alignment) { u64 rem; if (likely(remainder_mask)) rem = adj_start & remainder_mask; else div64_u64_rem(adj_start, alignment, &rem); if (rem) { adj_start -= rem; if (mode != DRM_MM_INSERT_HIGH) adj_start += alignment; if (adj_start < max(col_start, range_start) || min(col_end, range_end) - adj_start < size) continue; if (adj_end <= adj_start || adj_end - adj_start < size) continue; } } node->mm = mm; node->size = size; node->start = adj_start; node->color = color; node->hole_size = 0; list_add(&node->node_list, &hole->node_list); drm_mm_interval_tree_add_node(hole, node); node->allocated = true; rm_hole(hole); if (adj_start > hole_start) add_hole(hole); if (adj_start + size < hole_end) add_hole(node); save_stack(node); return 0; } return -ENOSPC; } EXPORT_SYMBOL(drm_mm_insert_node_in_range); /** * drm_mm_remove_node - Remove a memory node from the allocator. * @node: drm_mm_node to remove * * This just removes a node from its drm_mm allocator. The node does not need to * be cleared again before it can be re-inserted into this or any other drm_mm * allocator. It is a bug to call this function on a unallocated node. */ void drm_mm_remove_node(struct drm_mm_node *node) { struct drm_mm *mm = node->mm; struct drm_mm_node *prev_node; DRM_MM_BUG_ON(!node->allocated); DRM_MM_BUG_ON(node->scanned_block); prev_node = list_prev_entry(node, node_list); if (drm_mm_hole_follows(node)) rm_hole(node); drm_mm_interval_tree_remove(node, &mm->interval_tree); list_del(&node->node_list); node->allocated = false; if (drm_mm_hole_follows(prev_node)) rm_hole(prev_node); add_hole(prev_node); } EXPORT_SYMBOL(drm_mm_remove_node); /** * drm_mm_replace_node - move an allocation from @old to @new * @old: drm_mm_node to remove from the allocator * @new: drm_mm_node which should inherit @old's allocation * * This is useful for when drivers embed the drm_mm_node structure and hence * can't move allocations by reassigning pointers. It's a combination of remove * and insert with the guarantee that the allocation start will match. 
*/ void drm_mm_replace_node(struct drm_mm_node *old, struct drm_mm_node *new) { struct drm_mm *mm = old->mm; DRM_MM_BUG_ON(!old->allocated); *new = *old; list_replace(&old->node_list, &new->node_list); rb_replace_node_cached(&old->rb, &new->rb, &mm->interval_tree); if (drm_mm_hole_follows(old)) { list_replace(&old->hole_stack, &new->hole_stack); rb_replace_node_cached(&old->rb_hole_size, &new->rb_hole_size, &mm->holes_size); rb_replace_node(&old->rb_hole_addr, &new->rb_hole_addr, &mm->holes_addr); } old->allocated = false; new->allocated = true; } EXPORT_SYMBOL(drm_mm_replace_node); /** * DOC: lru scan roster * * Very often GPUs need to have continuous allocations for a given object. When * evicting objects to make space for a new one it is therefore not most * efficient when we simply start to select all objects from the tail of an LRU * until there's a suitable hole: Especially for big objects or nodes that * otherwise have special allocation constraints there's a good chance we evict * lots of (smaller) objects unnecessarily. * * The DRM range allocator supports this use-case through the scanning * interfaces. First a scan operation needs to be initialized with * drm_mm_scan_init() or drm_mm_scan_init_with_range(). The driver adds * objects to the roster, probably by walking an LRU list, but this can be * freely implemented. Eviction candiates are added using * drm_mm_scan_add_block() until a suitable hole is found or there are no * further evictable objects. Eviction roster metadata is tracked in &struct * drm_mm_scan. * * The driver must walk through all objects again in exactly the reverse * order to restore the allocator state. Note that while the allocator is used * in the scan mode no other operation is allowed. * * Finally the driver evicts all objects selected (drm_mm_scan_remove_block() * reported true) in the scan, and any overlapping nodes after color adjustment * (drm_mm_scan_color_evict()). Adding and removing an object is O(1), and * since freeing a node is also O(1) the overall complexity is * O(scanned_objects). So like the free stack which needs to be walked before a * scan operation even begins this is linear in the number of objects. It * doesn't seem to hurt too badly. */ /** * drm_mm_scan_init_with_range - initialize range-restricted lru scanning * @scan: scan state * @mm: drm_mm to scan * @size: size of the allocation * @alignment: alignment of the allocation * @color: opaque tag value to use for the allocation * @start: start of the allowed range for the allocation * @end: end of the allowed range for the allocation * @mode: fine-tune the allocation search and placement * * This simply sets up the scanning routines with the parameters for the desired * hole. * * Warning: * As long as the scan list is non-empty, no other operations than * adding/removing nodes to/from the scan list are allowed. */ void drm_mm_scan_init_with_range(struct drm_mm_scan *scan, struct drm_mm *mm, u64 size, u64 alignment, unsigned long color, u64 start, u64 end, enum drm_mm_insert_mode mode) { DRM_MM_BUG_ON(start >= end); DRM_MM_BUG_ON(!size || size > end - start); DRM_MM_BUG_ON(mm->scan_active); scan->mm = mm; if (alignment <= 1) alignment = 0; scan->color = color; scan->alignment = alignment; scan->remainder_mask = is_power_of_2(alignment) ? 
alignment - 1 : 0; scan->size = size; scan->mode = mode; DRM_MM_BUG_ON(end <= start); scan->range_start = start; scan->range_end = end; scan->hit_start = U64_MAX; scan->hit_end = 0; } EXPORT_SYMBOL(drm_mm_scan_init_with_range); /** * drm_mm_scan_add_block - add a node to the scan list * @scan: the active drm_mm scanner * @node: drm_mm_node to add * * Add a node to the scan list that might be freed to make space for the desired * hole. * * Returns: * True if a hole has been found, false otherwise. */ bool drm_mm_scan_add_block(struct drm_mm_scan *scan, struct drm_mm_node *node) { struct drm_mm *mm = scan->mm; struct drm_mm_node *hole; u64 hole_start, hole_end; u64 col_start, col_end; u64 adj_start, adj_end; DRM_MM_BUG_ON(node->mm != mm); DRM_MM_BUG_ON(!node->allocated); DRM_MM_BUG_ON(node->scanned_block); node->scanned_block = true; mm->scan_active++; /* Remove this block from the node_list so that we enlarge the hole * (distance between the end of our previous node and the start of * or next), without poisoning the link so that we can restore it * later in drm_mm_scan_remove_block(). */ hole = list_prev_entry(node, node_list); DRM_MM_BUG_ON(list_next_entry(hole, node_list) != node); __list_del_entry(&node->node_list); hole_start = __drm_mm_hole_node_start(hole); hole_end = __drm_mm_hole_node_end(hole); col_start = hole_start; col_end = hole_end; if (mm->color_adjust) mm->color_adjust(hole, scan->color, &col_start, &col_end); adj_start = max(col_start, scan->range_start); adj_end = min(col_end, scan->range_end); if (adj_end <= adj_start || adj_end - adj_start < scan->size) return false; if (scan->mode == DRM_MM_INSERT_HIGH) adj_start = adj_end - scan->size; if (scan->alignment) { u64 rem; if (likely(scan->remainder_mask)) rem = adj_start & scan->remainder_mask; else div64_u64_rem(adj_start, scan->alignment, &rem); if (rem) { adj_start -= rem; if (scan->mode != DRM_MM_INSERT_HIGH) adj_start += scan->alignment; if (adj_start < max(col_start, scan->range_start) || min(col_end, scan->range_end) - adj_start < scan->size) return false; if (adj_end <= adj_start || adj_end - adj_start < scan->size) return false; } } scan->hit_start = adj_start; scan->hit_end = adj_start + scan->size; DRM_MM_BUG_ON(scan->hit_start >= scan->hit_end); DRM_MM_BUG_ON(scan->hit_start < hole_start); DRM_MM_BUG_ON(scan->hit_end > hole_end); return true; } EXPORT_SYMBOL(drm_mm_scan_add_block); /** * drm_mm_scan_remove_block - remove a node from the scan list * @scan: the active drm_mm scanner * @node: drm_mm_node to remove * * Nodes **must** be removed in exactly the reverse order from the scan list as * they have been added (e.g. using list_add() as they are added and then * list_for_each() over that eviction list to remove), otherwise the internal * state of the memory manager will be corrupted. * * When the scan list is empty, the selected memory nodes can be freed. An * immediately following drm_mm_insert_node_in_range_generic() or one of the * simpler versions of that function with !DRM_MM_SEARCH_BEST will then return * the just freed block (because its at the top of the free_stack list). * * Returns: * True if this block should be evicted, false otherwise. Will always * return false when no hole has been found. 
*/ bool drm_mm_scan_remove_block(struct drm_mm_scan *scan, struct drm_mm_node *node) { struct drm_mm_node *prev_node; DRM_MM_BUG_ON(node->mm != scan->mm); DRM_MM_BUG_ON(!node->scanned_block); node->scanned_block = false; DRM_MM_BUG_ON(!node->mm->scan_active); node->mm->scan_active--; /* During drm_mm_scan_add_block() we decoupled this node leaving * its pointers intact. Now that the caller is walking back along * the eviction list we can restore this block into its rightful * place on the full node_list. To confirm that the caller is walking * backwards correctly we check that prev_node->next == node->next, * i.e. both believe the same node should be on the other side of the * hole. */ prev_node = list_prev_entry(node, node_list); DRM_MM_BUG_ON(list_next_entry(prev_node, node_list) != list_next_entry(node, node_list)); list_add(&node->node_list, &prev_node->node_list); return (node->start + node->size > scan->hit_start && node->start < scan->hit_end); } EXPORT_SYMBOL(drm_mm_scan_remove_block); /** * drm_mm_scan_color_evict - evict overlapping nodes on either side of hole * @scan: drm_mm scan with target hole * * After completing an eviction scan and removing the selected nodes, we may * need to remove a few more nodes from either side of the target hole if * mm.color_adjust is being used. * * Returns: * A node to evict, or NULL if there are no overlapping nodes. */ struct drm_mm_node *drm_mm_scan_color_evict(struct drm_mm_scan *scan) { struct drm_mm *mm = scan->mm; struct drm_mm_node *hole; u64 hole_start, hole_end; DRM_MM_BUG_ON(list_empty(&mm->hole_stack)); if (!mm->color_adjust) return NULL; /* * The hole found during scanning should ideally be the first element * in the hole_stack list, but due to side-effects in the driver it * may not be. */ list_for_each_entry(hole, &mm->hole_stack, hole_stack) { hole_start = __drm_mm_hole_node_start(hole); hole_end = hole_start + hole->hole_size; if (hole_start <= scan->hit_start && hole_end >= scan->hit_end) break; } /* We should only be called after we found the hole previously */ DRM_MM_BUG_ON(&hole->hole_stack == &mm->hole_stack); if (unlikely(&hole->hole_stack == &mm->hole_stack)) return NULL; DRM_MM_BUG_ON(hole_start > scan->hit_start); DRM_MM_BUG_ON(hole_end < scan->hit_end); mm->color_adjust(hole, scan->color, &hole_start, &hole_end); if (hole_start > scan->hit_start) return hole; if (hole_end < scan->hit_end) return list_next_entry(hole, node_list); return NULL; } EXPORT_SYMBOL(drm_mm_scan_color_evict); /** * drm_mm_init - initialize a drm-mm allocator * @mm: the drm_mm structure to initialize * @start: start of the range managed by @mm * @size: end of the range managed by @mm * * Note that @mm must be cleared to 0 before calling this function. */ void drm_mm_init(struct drm_mm *mm, u64 start, u64 size) { DRM_MM_BUG_ON(start + size <= start); mm->color_adjust = NULL; INIT_LIST_HEAD(&mm->hole_stack); mm->interval_tree = RB_ROOT_CACHED; mm->holes_size = RB_ROOT_CACHED; mm->holes_addr = RB_ROOT; /* Clever trick to avoid a special case in the free hole tracking. */ INIT_LIST_HEAD(&mm->head_node.node_list); mm->head_node.allocated = false; mm->head_node.mm = mm; mm->head_node.start = start + size; mm->head_node.size = -size; add_hole(&mm->head_node); mm->scan_active = 0; } EXPORT_SYMBOL(drm_mm_init); /** * drm_mm_takedown - clean up a drm_mm allocator * @mm: drm_mm allocator to clean up * * Note that it is a bug to call this function on an allocator which is not * clean. 
*/ void drm_mm_takedown(struct drm_mm *mm) { if (WARN(!drm_mm_clean(mm), "Memory manager not clean during takedown.\n")) show_leaks(mm); } EXPORT_SYMBOL(drm_mm_takedown); static u64 drm_mm_dump_hole(struct drm_printer *p, const struct drm_mm_node *entry) { u64 start, size; size = entry->hole_size; if (size) { start = drm_mm_hole_node_start(entry); drm_printf(p, "%#018llx-%#018llx: %llu: free\n", start, start + size, size); } return size; } /** * drm_mm_print - print allocator state * @mm: drm_mm allocator to print * @p: DRM printer to use */ void drm_mm_print(const struct drm_mm *mm, struct drm_printer *p) { const struct drm_mm_node *entry; u64 total_used = 0, total_free = 0, total = 0; total_free += drm_mm_dump_hole(p, &mm->head_node); drm_mm_for_each_node(entry, mm) { drm_printf(p, "%#018llx-%#018llx: %llu: used\n", entry->start, entry->start + entry->size, entry->size); total_used += entry->size; total_free += drm_mm_dump_hole(p, entry); } total = total_free + total_used; drm_printf(p, "total: %llu, used %llu free %llu\n", total, total_used, total_free); } EXPORT_SYMBOL(drm_mm_print); |
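/*
 * Illustrative driver-side sketch (not part of drm_mm.c): the allocator
 * lifecycle described in the DOC: Overview comment above - initialize a
 * drm_mm over an address range, carve nodes out of it with
 * drm_mm_insert_node_in_range(), return them with drm_mm_remove_node() and
 * tear the allocator down once it is empty.  The example_vram_* structure
 * and functions, the 4096-byte alignment and the range limits are
 * hypothetical; only the drm_mm calls themselves come from this file.
 */
#include <drm/drm_mm.h>
#include <linux/kernel.h>
#include <linux/string.h>

struct example_vram_manager {		/* hypothetical driver object */
	struct drm_mm mm;
};

static void example_vram_init(struct example_vram_manager *vm, u64 vram_size)
{
	/* The drm_mm must be zeroed before drm_mm_init(); assume kzalloc'ed. */
	drm_mm_init(&vm->mm, 0, vram_size);
}

static int example_vram_alloc(struct example_vram_manager *vm,
			      struct drm_mm_node *node, u64 size)
{
	/* Nodes must be cleared to 0 before insertion. */
	memset(node, 0, sizeof(*node));

	return drm_mm_insert_node_in_range(&vm->mm, node, size,
					   4096,		/* alignment */
					   0,			/* color */
					   0, U64_MAX,		/* range */
					   DRM_MM_INSERT_BEST);
}

static void example_vram_free(struct drm_mm_node *node)
{
	drm_mm_remove_node(node);
}

static void example_vram_fini(struct example_vram_manager *vm)
{
	/* All nodes must have been removed before takedown. */
	drm_mm_takedown(&vm->mm);
}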
/* * CBC: Cipher Block Chaining mode * * Copyright (c) 2016 Herbert Xu <herbert@gondor.apana.org.au> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * */ #ifndef _CRYPTO_CBC_H #define _CRYPTO_CBC_H #include <crypto/internal/skcipher.h> #include <linux/string.h> #include <linux/types.h> static inline int crypto_cbc_encrypt_segment( struct skcipher_walk *walk, struct crypto_skcipher *tfm, void (*fn)(struct crypto_skcipher *, const u8 *, u8 *)) { unsigned int bsize = crypto_skcipher_blocksize(tfm); unsigned int nbytes = walk->nbytes; u8 *src = walk->src.virt.addr; u8 *dst = walk->dst.virt.addr; u8 *iv = walk->iv; do { crypto_xor(iv, src, bsize); fn(tfm, iv, dst); memcpy(iv, dst, bsize); src += bsize; dst += bsize; } while ((nbytes -= bsize) >= bsize); return nbytes; } static inline int crypto_cbc_encrypt_inplace( struct skcipher_walk *walk, struct crypto_skcipher *tfm, void (*fn)(struct crypto_skcipher *, const u8 *, u8 *)) { unsigned int bsize = crypto_skcipher_blocksize(tfm); unsigned int nbytes = walk->nbytes; u8 *src = walk->src.virt.addr; u8 *iv = walk->iv; do { crypto_xor(src, iv, bsize); fn(tfm, src, src); iv = src; src += bsize; } while ((nbytes -= bsize) >= bsize); memcpy(walk->iv, iv, bsize); return nbytes; } static inline int crypto_cbc_encrypt_walk(struct skcipher_request *req, void (*fn)(struct crypto_skcipher *, const u8 *, u8 *)) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct skcipher_walk walk; int err; err = skcipher_walk_virt(&walk, req, false); while (walk.nbytes) { if (walk.src.virt.addr == walk.dst.virt.addr) err = crypto_cbc_encrypt_inplace(&walk, tfm, fn); else err = crypto_cbc_encrypt_segment(&walk, tfm, fn); err = skcipher_walk_done(&walk, err); } return err; } static inline int crypto_cbc_decrypt_segment( struct skcipher_walk *walk, struct crypto_skcipher *tfm, void (*fn)(struct crypto_skcipher *, const u8 *, u8 *)) { unsigned int bsize = crypto_skcipher_blocksize(tfm); unsigned int nbytes = walk->nbytes; u8 *src = walk->src.virt.addr; u8 *dst = walk->dst.virt.addr; u8 *iv = walk->iv; do { fn(tfm, src, dst); crypto_xor(dst, iv, bsize); iv = src; src += bsize; dst += bsize; } while ((nbytes -= bsize) >= bsize); memcpy(walk->iv, iv, bsize); return nbytes; } static inline int crypto_cbc_decrypt_inplace( struct skcipher_walk *walk, struct crypto_skcipher *tfm, void (*fn)(struct crypto_skcipher *, const u8 *, u8 *)) { unsigned int bsize = crypto_skcipher_blocksize(tfm); unsigned int nbytes = walk->nbytes; u8 *src = walk->src.virt.addr; u8 last_iv[bsize]; /* Start of the last block. 
*/ src += nbytes - (nbytes & (bsize - 1)) - bsize; memcpy(last_iv, src, bsize); for (;;) { fn(tfm, src, src); if ((nbytes -= bsize) < bsize) break; crypto_xor(src, src - bsize, bsize); src -= bsize; } crypto_xor(src, walk->iv, bsize); memcpy(walk->iv, last_iv, bsize); return nbytes; } static inline int crypto_cbc_decrypt_blocks( struct skcipher_walk *walk, struct crypto_skcipher *tfm, void (*fn)(struct crypto_skcipher *, const u8 *, u8 *)) { if (walk->src.virt.addr == walk->dst.virt.addr) return crypto_cbc_decrypt_inplace(walk, tfm, fn); else return crypto_cbc_decrypt_segment(walk, tfm, fn); } #endif /* _CRYPTO_CBC_H */ |
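/*
 * Example (not part of the header above): a minimal sketch of how an
 * skcipher driver might build its CBC handlers on top of these inline
 * helpers.  The my_cipher_*_one() single-block functions are hypothetical
 * stand-ins for a driver's real block primitive; crypto_cbc_encrypt_walk()
 * and crypto_cbc_decrypt_blocks() are the helpers defined above.
 */
#include <crypto/cbc.h>
#include <crypto/internal/skcipher.h>

static void my_cipher_encrypt_one(struct crypto_skcipher *tfm,
				  const u8 *src, u8 *dst)
{
	/* hypothetical: encrypt exactly one block of src into dst */
}

static void my_cipher_decrypt_one(struct crypto_skcipher *tfm,
				  const u8 *src, u8 *dst)
{
	/* hypothetical: decrypt exactly one block of src into dst */
}

static int my_cbc_encrypt(struct skcipher_request *req)
{
	/* The helper walks the request and chains each block through the IV. */
	return crypto_cbc_encrypt_walk(req, my_cipher_encrypt_one);
}

static int my_cbc_decrypt(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct skcipher_walk walk;
	int err;

	err = skcipher_walk_virt(&walk, req, false);
	while (walk.nbytes) {
		err = crypto_cbc_decrypt_blocks(&walk, tfm,
						my_cipher_decrypt_one);
		err = skcipher_walk_done(&walk, err);
	}
	return err;
}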
8 8 4 8 8 8 8 8 8 7 7 7 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 | /* * Scatterlist Cryptographic API. * * Procfs information. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * */ #include <linux/atomic.h> #include <linux/init.h> #include <linux/crypto.h> #include <linux/module.h> /* for module_name() */ #include <linux/rwsem.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include "internal.h" static void *c_start(struct seq_file *m, loff_t *pos) { down_read(&crypto_alg_sem); return seq_list_start(&crypto_alg_list, *pos); } static void *c_next(struct seq_file *m, void *p, loff_t *pos) { return seq_list_next(p, &crypto_alg_list, pos); } static void c_stop(struct seq_file *m, void *p) { up_read(&crypto_alg_sem); } static int c_show(struct seq_file *m, void *p) { struct crypto_alg *alg = list_entry(p, struct crypto_alg, cra_list); seq_printf(m, "name : %s\n", alg->cra_name); seq_printf(m, "driver : %s\n", alg->cra_driver_name); seq_printf(m, "module : %s\n", module_name(alg->cra_module)); seq_printf(m, "priority : %d\n", alg->cra_priority); seq_printf(m, "refcnt : %u\n", refcount_read(&alg->cra_refcnt)); seq_printf(m, "selftest : %s\n", (alg->cra_flags & CRYPTO_ALG_TESTED) ? "passed" : "unknown"); seq_printf(m, "internal : %s\n", (alg->cra_flags & CRYPTO_ALG_INTERNAL) ? "yes" : "no"); if (alg->cra_flags & CRYPTO_ALG_LARVAL) { seq_printf(m, "type : larval\n"); seq_printf(m, "flags : 0x%x\n", alg->cra_flags); goto out; } if (alg->cra_type && alg->cra_type->show) { alg->cra_type->show(m, alg); goto out; } switch (alg->cra_flags & (CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_LARVAL)) { case CRYPTO_ALG_TYPE_CIPHER: seq_printf(m, "type : cipher\n"); seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); seq_printf(m, "min keysize : %u\n", alg->cra_cipher.cia_min_keysize); seq_printf(m, "max keysize : %u\n", alg->cra_cipher.cia_max_keysize); break; case CRYPTO_ALG_TYPE_COMPRESS: seq_printf(m, "type : compression\n"); break; default: seq_printf(m, "type : unknown\n"); break; } out: seq_putc(m, '\n'); return 0; } static const struct seq_operations crypto_seq_ops = { .start = c_start, .next = c_next, .stop = c_stop, .show = c_show }; void __init crypto_init_proc(void) { proc_create_seq("crypto", 0, NULL, &crypto_seq_ops); } void __exit crypto_exit_proc(void) { remove_proc_entry("crypto", NULL); } |
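/*
 * Example (not part of proc.c above): the record format that c_show()
 * emits for a plain CRYPTO_ALG_TYPE_CIPHER entry in /proc/crypto.  The
 * values are representative of the generic AES implementation and may
 * differ on a given kernel; the field names and order follow the
 * seq_printf() calls above.
 *
 *	name         : aes
 *	driver       : aes-generic
 *	module       : kernel
 *	priority     : 100
 *	refcnt       : 1
 *	selftest     : passed
 *	internal     : no
 *	type         : cipher
 *	blocksize    : 16
 *	min keysize  : 16
 *	max keysize  : 32
 */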
4 58 57 52 52 52 52 52 52 58 58 52 52 52 58 19 19 19 19 19 18 1 19 19 19 19 19 12 12 12 12 12 12 12 12 12 10 10 31 30 19 12 12 28 28 1 28 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 | // SPDX-License-Identifier: GPL-2.0 /* * Functions related to generic helpers functions */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/scatterlist.h> #include "blk.h" static struct bio *next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp) { struct bio *new = bio_alloc(gfp, nr_pages); if (bio) { bio_chain(bio, new); submit_bio(bio); } return new; } int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, int flags, struct bio **biop) { struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; unsigned int op; sector_t bs_mask; if (!q) return -ENXIO; if (bdev_read_only(bdev)) return -EPERM; if (flags & BLKDEV_DISCARD_SECURE) { if (!blk_queue_secure_erase(q)) return -EOPNOTSUPP; op = REQ_OP_SECURE_ERASE; } else { if (!blk_queue_discard(q)) return -EOPNOTSUPP; op = REQ_OP_DISCARD; } bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; if ((sector | nr_sects) & bs_mask) return -EINVAL; if (!nr_sects) return -EINVAL; while (nr_sects) { sector_t req_sects = min_t(sector_t, nr_sects, bio_allowed_max_sectors(q)); WARN_ON_ONCE((req_sects << 9) > UINT_MAX); bio = next_bio(bio, 0, gfp_mask); bio->bi_iter.bi_sector = sector; bio_set_dev(bio, bdev); bio_set_op_attrs(bio, op, 0); bio->bi_iter.bi_size = req_sects << 9; sector += req_sects; nr_sects -= req_sects; /* * We can loop for a long time in here, if someone does * full device discards (like mkfs). Be nice and allow * us to schedule out to avoid softlocking if preempt * is disabled. 
*/ cond_resched(); } *biop = bio; return 0; } EXPORT_SYMBOL(__blkdev_issue_discard); /** * blkdev_issue_discard - queue a discard * @bdev: blockdev to issue discard for * @sector: start sector * @nr_sects: number of sectors to discard * @gfp_mask: memory allocation flags (for bio_alloc) * @flags: BLKDEV_DISCARD_* flags to control behaviour * * Description: * Issue a discard request for the sectors in question. */ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) { struct bio *bio = NULL; struct blk_plug plug; int ret; blk_start_plug(&plug); ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags, &bio); if (!ret && bio) { ret = submit_bio_wait(bio); if (ret == -EOPNOTSUPP) ret = 0; bio_put(bio); } blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL(blkdev_issue_discard); /** * __blkdev_issue_write_same - generate number of bios with same page * @bdev: target blockdev * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) * @page: page containing data to write * @biop: pointer to anchor bio * * Description: * Generate and issue number of bios(REQ_OP_WRITE_SAME) with same page. */ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page, struct bio **biop) { struct request_queue *q = bdev_get_queue(bdev); unsigned int max_write_same_sectors; struct bio *bio = *biop; sector_t bs_mask; if (!q) return -ENXIO; if (bdev_read_only(bdev)) return -EPERM; bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; if ((sector | nr_sects) & bs_mask) return -EINVAL; if (!bdev_write_same(bdev)) return -EOPNOTSUPP; /* Ensure that max_write_same_sectors doesn't overflow bi_size */ max_write_same_sectors = bio_allowed_max_sectors(q); while (nr_sects) { bio = next_bio(bio, 1, gfp_mask); bio->bi_iter.bi_sector = sector; bio_set_dev(bio, bdev); bio->bi_vcnt = 1; bio->bi_io_vec->bv_page = page; bio->bi_io_vec->bv_offset = 0; bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev); bio_set_op_attrs(bio, REQ_OP_WRITE_SAME, 0); if (nr_sects > max_write_same_sectors) { bio->bi_iter.bi_size = max_write_same_sectors << 9; nr_sects -= max_write_same_sectors; sector += max_write_same_sectors; } else { bio->bi_iter.bi_size = nr_sects << 9; nr_sects = 0; } cond_resched(); } *biop = bio; return 0; } /** * blkdev_issue_write_same - queue a write same operation * @bdev: target blockdev * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) * @page: page containing data * * Description: * Issue a write same request for the sectors in question. 
*/ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page) { struct bio *bio = NULL; struct blk_plug plug; int ret; blk_start_plug(&plug); ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, page, &bio); if (ret == 0 && bio) { ret = submit_bio_wait(bio); bio_put(bio); } blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL(blkdev_issue_write_same); static int __blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags) { struct bio *bio = *biop; unsigned int max_write_zeroes_sectors; struct request_queue *q = bdev_get_queue(bdev); if (!q) return -ENXIO; if (bdev_read_only(bdev)) return -EPERM; /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); if (max_write_zeroes_sectors == 0) return -EOPNOTSUPP; while (nr_sects) { bio = next_bio(bio, 0, gfp_mask); bio->bi_iter.bi_sector = sector; bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE_ZEROES; if (flags & BLKDEV_ZERO_NOUNMAP) bio->bi_opf |= REQ_NOUNMAP; if (nr_sects > max_write_zeroes_sectors) { bio->bi_iter.bi_size = max_write_zeroes_sectors << 9; nr_sects -= max_write_zeroes_sectors; sector += max_write_zeroes_sectors; } else { bio->bi_iter.bi_size = nr_sects << 9; nr_sects = 0; } cond_resched(); } *biop = bio; return 0; } /* * Convert a number of 512B sectors to a number of pages. * The result is limited to a number of pages that can fit into a BIO. * Also make sure that the result is always at least 1 (page) for the cases * where nr_sects is lower than the number of sectors in a page. */ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) { sector_t pages = DIV_ROUND_UP_SECTOR_T(nr_sects, PAGE_SIZE / 512); return min(pages, (sector_t)BIO_MAX_PAGES); } static int __blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) { struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; int bi_size = 0; unsigned int sz; if (!q) return -ENXIO; if (bdev_read_only(bdev)) return -EPERM; while (nr_sects != 0) { bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), gfp_mask); bio->bi_iter.bi_sector = sector; bio_set_dev(bio, bdev); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); while (nr_sects != 0) { sz = min((sector_t) PAGE_SIZE, nr_sects << 9); bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0); nr_sects -= bi_size >> 9; sector += bi_size >> 9; if (bi_size < sz) break; } cond_resched(); } *biop = bio; return 0; } /** * __blkdev_issue_zeroout - generate number of zero filed write bios * @bdev: blockdev to issue * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) * @biop: pointer to anchor bio * @flags: controls detailed behavior * * Description: * Zero-fill a block range, either using hardware offload or by explicitly * writing zeroes to the device. * * If a device is using logical block provisioning, the underlying space will * not be released if %flags contains BLKDEV_ZERO_NOUNMAP. * * If %flags contains BLKDEV_ZERO_NOFALLBACK, the function will return * -EOPNOTSUPP if no explicit hardware offload for zeroing is provided. 
*/ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags) { int ret; sector_t bs_mask; bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; if ((sector | nr_sects) & bs_mask) return -EINVAL; ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, biop, flags); if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK)) return ret; return __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, biop); } EXPORT_SYMBOL(__blkdev_issue_zeroout); /** * blkdev_issue_zeroout - zero-fill a block range * @bdev: blockdev to write * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) * @flags: controls detailed behavior * * Description: * Zero-fill a block range, either using hardware offload or by explicitly * writing zeroes to the device. See __blkdev_issue_zeroout() for the * valid values for %flags. */ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned flags) { int ret = 0; sector_t bs_mask; struct bio *bio; struct blk_plug plug; bool try_write_zeroes = !!bdev_write_zeroes_sectors(bdev); bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; if ((sector | nr_sects) & bs_mask) return -EINVAL; retry: bio = NULL; blk_start_plug(&plug); if (try_write_zeroes) { ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, &bio, flags); } else if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { ret = __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, &bio); } else { /* No zeroing offload support */ ret = -EOPNOTSUPP; } if (ret == 0 && bio) { ret = submit_bio_wait(bio); bio_put(bio); } blk_finish_plug(&plug); if (ret && try_write_zeroes) { if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { try_write_zeroes = false; goto retry; } if (!bdev_write_zeroes_sectors(bdev)) { /* * Zeroing offload support was indicated, but the * device reported ILLEGAL REQUEST (for some devices * there is no non-destructive way to verify whether * WRITE ZEROES is actually supported). */ ret = -EOPNOTSUPP; } } return ret; } EXPORT_SYMBOL(blkdev_issue_zeroout); |
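/*
 * Example (not part of blk-lib.c above): a minimal sketch of a caller
 * releasing an extent, preferring a hardware discard and falling back to
 * explicit zeroing when the queue reports -EOPNOTSUPP.  my_release_extent()
 * and the GFP choice are hypothetical; the two blkdev_issue_*() helpers are
 * the ones defined above.
 */
#include <linux/blkdev.h>

static int my_release_extent(struct block_device *bdev, u64 start, u64 len)
{
	sector_t sector = start >> 9;	/* byte offset -> 512B sectors */
	sector_t nr_sects = len >> 9;
	int ret;

	/* Plain discard: no BLKDEV_DISCARD_SECURE, default behaviour. */
	ret = blkdev_issue_discard(bdev, sector, nr_sects, GFP_NOFS, 0);
	if (ret != -EOPNOTSUPP)
		return ret;

	/* Device cannot discard: write zeroes (offload or page fallback). */
	return blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_NOFS, 0);
}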
/* * kallsyms.c: in-kernel printing of symbolic oopses and stack traces.
* * Rewritten and vastly simplified by Rusty Russell for in-kernel * module loader: * Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation * * ChangeLog: * * (25/Aug/2004) Paulo Marques <pmarques@grupopie.com> * Changed the compression method from stem compression to "table lookup" * compression (see scripts/kallsyms.c for a more complete description) */ #include <linux/kallsyms.h> #include <linux/init.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/kdb.h> #include <linux/err.h> #include <linux/proc_fs.h> #include <linux/sched.h> /* for cond_resched */ #include <linux/ctype.h> #include <linux/slab.h> #include <linux/filter.h> #include <linux/ftrace.h> #include <linux/compiler.h> /* * These will be re-linked against their real values * during the second link stage. */ extern const unsigned long kallsyms_addresses[] __weak; extern const int kallsyms_offsets[] __weak; extern const u8 kallsyms_names[] __weak; /* * Tell the compiler that the count isn't in the small data section if the arch * has one (eg: FRV). */ extern const unsigned long kallsyms_num_syms __attribute__((weak, section(".rodata"))); extern const unsigned long kallsyms_relative_base __attribute__((weak, section(".rodata"))); extern const u8 kallsyms_token_table[] __weak; extern const u16 kallsyms_token_index[] __weak; extern const unsigned long kallsyms_markers[] __weak; /* * Expand a compressed symbol data into the resulting uncompressed string, * if uncompressed string is too long (>= maxlen), it will be truncated, * given the offset to where the symbol is in the compressed stream. */ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result, size_t maxlen) { int len, skipped_first = 0; const u8 *tptr, *data; /* Get the compressed symbol length from the first symbol byte. */ data = &kallsyms_names[off]; len = *data; data++; /* * Update the offset to return the offset for the next symbol on * the compressed stream. */ off += len + 1; /* * For every byte on the compressed symbol data, copy the table * entry for that byte. */ while (len) { tptr = &kallsyms_token_table[kallsyms_token_index[*data]]; data++; len--; while (*tptr) { if (skipped_first) { if (maxlen <= 1) goto tail; *result = *tptr; result++; maxlen--; } else skipped_first = 1; tptr++; } } tail: if (maxlen) *result = '\0'; /* Return to offset to the next symbol. */ return off; } /* * Get symbol type information. This is encoded as a single char at the * beginning of the symbol name. */ static char kallsyms_get_symbol_type(unsigned int off) { /* * Get just the first code, look it up in the token table, * and return the first char from this token. */ return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]]; } /* * Find the offset on the compressed stream given and index in the * kallsyms array. */ static unsigned int get_symbol_offset(unsigned long pos) { const u8 *name; int i; /* * Use the closest marker we have. We have markers every 256 positions, * so that should be close enough. */ name = &kallsyms_names[kallsyms_markers[pos >> 8]]; /* * Sequentially scan all the symbols up to the point we're searching * for. Every symbol is stored in a [<len>][<len> bytes of data] format, * so we just need to add the len to the current pointer for every * symbol we wish to skip. 
*/ for (i = 0; i < (pos & 0xFF); i++) name = name + (*name) + 1; return name - kallsyms_names; } static unsigned long kallsyms_sym_address(int idx) { if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) return kallsyms_addresses[idx]; /* values are unsigned offsets if --absolute-percpu is not in effect */ if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU)) return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; /* ...otherwise, positive offsets are absolute values */ if (kallsyms_offsets[idx] >= 0) return kallsyms_offsets[idx]; /* ...and negative offsets are relative to kallsyms_relative_base - 1 */ return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; } /* Lookup the address for this symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name) { char namebuf[KSYM_NAME_LEN]; unsigned long i; unsigned int off; for (i = 0, off = 0; i < kallsyms_num_syms; i++) { off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); if (strcmp(namebuf, name) == 0) return kallsyms_sym_address(i); } return module_kallsyms_lookup_name(name); } EXPORT_SYMBOL_GPL(kallsyms_lookup_name); int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data) { char namebuf[KSYM_NAME_LEN]; unsigned long i; unsigned int off; int ret; for (i = 0, off = 0; i < kallsyms_num_syms; i++) { off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); ret = fn(data, namebuf, NULL, kallsyms_sym_address(i)); if (ret != 0) return ret; } return module_kallsyms_on_each_symbol(fn, data); } EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol); static unsigned long get_symbol_pos(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { unsigned long symbol_start = 0, symbol_end = 0; unsigned long i, low, high, mid; /* This kernel should never had been booted. */ if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) BUG_ON(!kallsyms_addresses); else BUG_ON(!kallsyms_offsets); /* Do a binary search on the sorted kallsyms_addresses array. */ low = 0; high = kallsyms_num_syms; while (high - low > 1) { mid = low + (high - low) / 2; if (kallsyms_sym_address(mid) <= addr) low = mid; else high = mid; } /* * Search for the first aliased symbol. Aliased * symbols are symbols with the same address. */ while (low && kallsyms_sym_address(low-1) == kallsyms_sym_address(low)) --low; symbol_start = kallsyms_sym_address(low); /* Search for next non-aliased symbol. */ for (i = low + 1; i < kallsyms_num_syms; i++) { if (kallsyms_sym_address(i) > symbol_start) { symbol_end = kallsyms_sym_address(i); break; } } /* If we found no next symbol, we use the end of the section. */ if (!symbol_end) { if (is_kernel_inittext(addr)) symbol_end = (unsigned long)_einittext; else if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) symbol_end = (unsigned long)_end; else symbol_end = (unsigned long)_etext; } if (symbolsize) *symbolsize = symbol_end - symbol_start; if (offset) *offset = addr - symbol_start; return low; } /* * Lookup an address but don't bother to find any names. */ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { char namebuf[KSYM_NAME_LEN]; if (is_ksym_addr(addr)) { get_symbol_pos(addr, symbolsize, offset); return 1; } return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) || !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } /* * Lookup an address * - modname is set to NULL if it's in the kernel. * - We guarantee that the returned name is valid until we reschedule even if. * It resides in a module. 
* - We also guarantee that modname will be valid until rescheduled. */ const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf) { const char *ret; namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; if (is_ksym_addr(addr)) { unsigned long pos; pos = get_symbol_pos(addr, symbolsize, offset); /* Grab name */ kallsyms_expand_symbol(get_symbol_offset(pos), namebuf, KSYM_NAME_LEN); if (modname) *modname = NULL; return namebuf; } /* See if it's in a module or a BPF JITed image. */ ret = module_address_lookup(addr, symbolsize, offset, modname, namebuf); if (!ret) ret = bpf_address_lookup(addr, symbolsize, offset, modname, namebuf); if (!ret) ret = ftrace_mod_address_lookup(addr, symbolsize, offset, modname, namebuf); return ret; } int lookup_symbol_name(unsigned long addr, char *symname) { symname[0] = '\0'; symname[KSYM_NAME_LEN - 1] = '\0'; if (is_ksym_addr(addr)) { unsigned long pos; pos = get_symbol_pos(addr, NULL, NULL); /* Grab name */ kallsyms_expand_symbol(get_symbol_offset(pos), symname, KSYM_NAME_LEN); return 0; } /* See if it's in a module. */ return lookup_module_symbol_name(addr, symname); } int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name) { name[0] = '\0'; name[KSYM_NAME_LEN - 1] = '\0'; if (is_ksym_addr(addr)) { unsigned long pos; pos = get_symbol_pos(addr, size, offset); /* Grab name */ kallsyms_expand_symbol(get_symbol_offset(pos), name, KSYM_NAME_LEN); modname[0] = '\0'; return 0; } /* See if it's in a module. */ return lookup_module_symbol_attrs(addr, size, offset, modname, name); } /* Look up a kernel symbol and return it in a text buffer. */ static int __sprint_symbol(char *buffer, unsigned long address, int symbol_offset, int add_offset) { char *modname; const char *name; unsigned long offset, size; int len; address += symbol_offset; name = kallsyms_lookup(address, &size, &offset, &modname, buffer); if (!name) return sprintf(buffer, "0x%lx", address - symbol_offset); if (name != buffer) strcpy(buffer, name); len = strlen(buffer); offset -= symbol_offset; if (add_offset) len += sprintf(buffer + len, "+%#lx/%#lx", offset, size); if (modname) len += sprintf(buffer + len, " [%s]", modname); return len; } /** * sprint_symbol - Look up a kernel symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup * * This function looks up a kernel symbol with @address and stores its name, * offset, size and module name to @buffer if possible. If no symbol was found, * just saves its @address as is. * * This function returns the number of bytes stored in @buffer. */ int sprint_symbol(char *buffer, unsigned long address) { return __sprint_symbol(buffer, address, 0, 1); } EXPORT_SYMBOL_GPL(sprint_symbol); /** * sprint_symbol_no_offset - Look up a kernel symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup * * This function looks up a kernel symbol with @address and stores its name * and module name to @buffer if possible. If no symbol was found, just saves * its @address as is. * * This function returns the number of bytes stored in @buffer. 
*/ int sprint_symbol_no_offset(char *buffer, unsigned long address) { return __sprint_symbol(buffer, address, 0, 0); } EXPORT_SYMBOL_GPL(sprint_symbol_no_offset); /** * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer * @buffer: buffer to be stored * @address: address to lookup * * This function is for stack backtrace and does the same thing as * sprint_symbol() but with modified/decreased @address. If there is a * tail-call to the function marked "noreturn", gcc optimized out code after * the call so that the stack-saved return address could point outside of the * caller. This function ensures that kallsyms will find the original caller * by decreasing @address. * * This function returns the number of bytes stored in @buffer. */ int sprint_backtrace(char *buffer, unsigned long address) { return __sprint_symbol(buffer, address, -1, 1); } /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ struct kallsym_iter { loff_t pos; loff_t pos_arch_end; loff_t pos_mod_end; loff_t pos_ftrace_mod_end; unsigned long value; unsigned int nameoff; /* If iterating in core kernel symbols. */ char type; char name[KSYM_NAME_LEN]; char module_name[MODULE_NAME_LEN]; int exported; int show_value; }; int __weak arch_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *name) { return -EINVAL; } static int get_ksymbol_arch(struct kallsym_iter *iter) { int ret = arch_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, &iter->type, iter->name); if (ret < 0) { iter->pos_arch_end = iter->pos; return 0; } return 1; } static int get_ksymbol_mod(struct kallsym_iter *iter) { int ret = module_get_kallsym(iter->pos - iter->pos_arch_end, &iter->value, &iter->type, iter->name, iter->module_name, &iter->exported); if (ret < 0) { iter->pos_mod_end = iter->pos; return 0; } return 1; } static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) { int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end, &iter->value, &iter->type, iter->name, iter->module_name, &iter->exported); if (ret < 0) { iter->pos_ftrace_mod_end = iter->pos; return 0; } return 1; } static int get_ksymbol_bpf(struct kallsym_iter *iter) { iter->module_name[0] = '\0'; iter->exported = 0; return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, &iter->value, &iter->type, iter->name) < 0 ? 0 : 1; } /* Returns space to next name. */ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) { unsigned off = iter->nameoff; iter->module_name[0] = '\0'; iter->value = kallsyms_sym_address(iter->pos); iter->type = kallsyms_get_symbol_type(off); off = kallsyms_expand_symbol(off, iter->name, ARRAY_SIZE(iter->name)); return off - iter->nameoff; } static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) { iter->name[0] = '\0'; iter->nameoff = get_symbol_offset(new_pos); iter->pos = new_pos; if (new_pos == 0) { iter->pos_arch_end = 0; iter->pos_mod_end = 0; iter->pos_ftrace_mod_end = 0; } } /* * The end position (last + 1) of each additional kallsyms section is recorded * in iter->pos_..._end as each section is added, and so can be used to * determine which get_ksymbol_...() function to call next. 
*/ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos) { iter->pos = pos; if ((!iter->pos_arch_end || iter->pos_arch_end > pos) && get_ksymbol_arch(iter)) return 1; if ((!iter->pos_mod_end || iter->pos_mod_end > pos) && get_ksymbol_mod(iter)) return 1; if ((!iter->pos_ftrace_mod_end || iter->pos_ftrace_mod_end > pos) && get_ksymbol_ftrace_mod(iter)) return 1; return get_ksymbol_bpf(iter); } /* Returns false if pos at or past end of file. */ static int update_iter(struct kallsym_iter *iter, loff_t pos) { /* Module symbols can be accessed randomly. */ if (pos >= kallsyms_num_syms) return update_iter_mod(iter, pos); /* If we're not on the desired position, reset to new position. */ if (pos != iter->pos) reset_iter(iter, pos); iter->nameoff += get_ksymbol_core(iter); iter->pos++; return 1; } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { (*pos)++; if (!update_iter(m->private, *pos)) return NULL; return p; } static void *s_start(struct seq_file *m, loff_t *pos) { if (!update_iter(m->private, *pos)) return NULL; return m->private; } static void s_stop(struct seq_file *m, void *p) { } static int s_show(struct seq_file *m, void *p) { void *value; struct kallsym_iter *iter = m->private; /* Some debugging symbols have no name. Ignore them. */ if (!iter->name[0]) return 0; value = iter->show_value ? (void *)iter->value : NULL; if (iter->module_name[0]) { char type; /* * Label it "global" if it is exported, * "local" if not exported. */ type = iter->exported ? toupper(iter->type) : tolower(iter->type); seq_printf(m, "%px %c %s\t[%s]\n", value, type, iter->name, iter->module_name); } else seq_printf(m, "%px %c %s\n", value, iter->type, iter->name); return 0; } static const struct seq_operations kallsyms_op = { .start = s_start, .next = s_next, .stop = s_stop, .show = s_show }; static inline int kallsyms_for_perf(void) { #ifdef CONFIG_PERF_EVENTS extern int sysctl_perf_event_paranoid; if (sysctl_perf_event_paranoid <= 1) return 1; #endif return 0; } /* * We show kallsyms information even to normal users if we've enabled * kernel profiling and are explicitly not paranoid (so kptr_restrict * is clear, and sysctl_perf_event_paranoid isn't set). * * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to * block even that). */ bool kallsyms_show_value(const struct cred *cred) { switch (kptr_restrict) { case 0: if (kallsyms_for_perf()) return true; /* fallthrough */ case 1: if (security_capable(cred, &init_user_ns, CAP_SYSLOG, CAP_OPT_NOAUDIT) == 0) return true; /* fallthrough */ default: return false; } } static int kallsyms_open(struct inode *inode, struct file *file) { /* * We keep iterator in m->private, since normal case is to * s_start from where we left off, so we avoid doing * using get_symbol_offset for every symbol. */ struct kallsym_iter *iter; iter = __seq_open_private(file, &kallsyms_op, sizeof(*iter)); if (!iter) return -ENOMEM; reset_iter(iter, 0); /* * Instead of checking this on every s_show() call, cache * the result here at open time. */ iter->show_value = kallsyms_show_value(file->f_cred); return 0; } #ifdef CONFIG_KGDB_KDB const char *kdb_walk_kallsyms(loff_t *pos) { static struct kallsym_iter kdb_walk_kallsyms_iter; if (*pos == 0) { memset(&kdb_walk_kallsyms_iter, 0, sizeof(kdb_walk_kallsyms_iter)); reset_iter(&kdb_walk_kallsyms_iter, 0); } while (1) { if (!update_iter(&kdb_walk_kallsyms_iter, *pos)) return NULL; ++*pos; /* Some debugging symbols have no name. Ignore them. 
*/ if (kdb_walk_kallsyms_iter.name[0]) return kdb_walk_kallsyms_iter.name; } } #endif /* CONFIG_KGDB_KDB */ static const struct file_operations kallsyms_operations = { .open = kallsyms_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, }; static int __init kallsyms_init(void) { proc_create("kallsyms", 0444, NULL, &kallsyms_operations); return 0; } device_initcall(kallsyms_init); |
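/*
 * Example (not part of kallsyms.c above): a minimal sketch of the two most
 * common consumers of this code.  my_debug_report() is hypothetical;
 * sprint_symbol() and kallsyms_lookup_name() are defined above, and
 * KSYM_SYMBOL_LEN comes from <linux/kallsyms.h>.
 */
#include <linux/kallsyms.h>
#include <linux/printk.h>

static void my_debug_report(unsigned long addr)
{
	char sym[KSYM_SYMBOL_LEN];

	/* Formats "name+0xoff/0xsize [module]", or the raw address. */
	sprint_symbol(sym, addr);
	pr_info("faulting instruction: %s\n", sym);

	/* The reverse direction: resolve a name to an address (0 if absent). */
	pr_info("jiffies lives at %#lx\n", kallsyms_lookup_name("jiffies"));
}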
100 100 100 100 39 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 | /* * Copyright (c) by Jaroslav Kysela <perex@perex.cz> * Takashi Iwai <tiwai@suse.de> * * Generic memory allocators * * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #include <linux/slab.h> #include <linux/mm.h> #include <linux/dma-mapping.h> #include <linux/genalloc.h> #include <sound/memalloc.h> /* * * Generic memory allocators * */ /** * snd_malloc_pages - allocate pages with the given size * @size: the size to allocate in bytes * @gfp_flags: the allocation conditions, GFP_XXX * * Allocates the physically contiguous pages with the given size. * * Return: The pointer of the buffer, or %NULL if no enough memory. */ void *snd_malloc_pages(size_t size, gfp_t gfp_flags) { int pg; if (WARN_ON(!size)) return NULL; if (WARN_ON(!gfp_flags)) return NULL; gfp_flags |= __GFP_COMP; /* compound page lets parts be mapped */ pg = get_order(size); return (void *) __get_free_pages(gfp_flags, pg); } EXPORT_SYMBOL(snd_malloc_pages); /** * snd_free_pages - release the pages * @ptr: the buffer pointer to release * @size: the allocated buffer size * * Releases the buffer allocated via snd_malloc_pages(). 
*/ void snd_free_pages(void *ptr, size_t size) { int pg; if (ptr == NULL) return; pg = get_order(size); free_pages((unsigned long) ptr, pg); } EXPORT_SYMBOL(snd_free_pages); /* * * Bus-specific memory allocators * */ #ifdef CONFIG_HAS_DMA /* allocate the coherent DMA pages */ static void *snd_malloc_dev_pages(struct device *dev, size_t size, dma_addr_t *dma) { int pg; gfp_t gfp_flags; if (WARN_ON(!dma)) return NULL; pg = get_order(size); gfp_flags = GFP_KERNEL | __GFP_COMP /* compound page lets parts be mapped */ | __GFP_NORETRY /* don't trigger OOM-killer */ | __GFP_NOWARN; /* no stack trace print - this call is non-critical */ return dma_alloc_coherent(dev, PAGE_SIZE << pg, dma, gfp_flags); } /* free the coherent DMA pages */ static void snd_free_dev_pages(struct device *dev, size_t size, void *ptr, dma_addr_t dma) { int pg; if (ptr == NULL) return; pg = get_order(size); dma_free_coherent(dev, PAGE_SIZE << pg, ptr, dma); } #ifdef CONFIG_GENERIC_ALLOCATOR /** * snd_malloc_dev_iram - allocate memory from on-chip internal ram * @dmab: buffer allocation record to store the allocated data * @size: number of bytes to allocate from the iram * * This function requires iram phandle provided via of_node */ static void snd_malloc_dev_iram(struct snd_dma_buffer *dmab, size_t size) { struct device *dev = dmab->dev.dev; struct gen_pool *pool = NULL; dmab->area = NULL; dmab->addr = 0; if (dev->of_node) pool = of_gen_pool_get(dev->of_node, "iram", 0); if (!pool) return; /* Assign the pool into private_data field */ dmab->private_data = pool; dmab->area = gen_pool_dma_alloc(pool, size, &dmab->addr); } /** * snd_free_dev_iram - free allocated specific memory from on-chip internal ram * @dmab: buffer allocation record to store the allocated data */ static void snd_free_dev_iram(struct snd_dma_buffer *dmab) { struct gen_pool *pool = dmab->private_data; if (pool && dmab->area) gen_pool_free(pool, (unsigned long)dmab->area, dmab->bytes); } #endif /* CONFIG_GENERIC_ALLOCATOR */ #endif /* CONFIG_HAS_DMA */ /* * * ALSA generic memory management * */ /** * snd_dma_alloc_pages - allocate the buffer area according to the given type * @type: the DMA buffer type * @device: the device pointer * @size: the buffer size to allocate * @dmab: buffer allocation record to store the allocated data * * Calls the memory-allocator function for the corresponding * buffer type. * * Return: Zero if the buffer with the given size is allocated successfully, * otherwise a negative value on error. */ int snd_dma_alloc_pages(int type, struct device *device, size_t size, struct snd_dma_buffer *dmab) { if (WARN_ON(!size)) return -ENXIO; if (WARN_ON(!dmab)) return -ENXIO; size = PAGE_ALIGN(size); dmab->dev.type = type; dmab->dev.dev = device; dmab->bytes = 0; switch (type) { case SNDRV_DMA_TYPE_CONTINUOUS: dmab->area = snd_malloc_pages(size, (__force gfp_t)(unsigned long)device); dmab->addr = 0; break; #ifdef CONFIG_HAS_DMA #ifdef CONFIG_GENERIC_ALLOCATOR case SNDRV_DMA_TYPE_DEV_IRAM: snd_malloc_dev_iram(dmab, size); if (dmab->area) break; /* Internal memory might have limited size and no enough space, * so if we fail to malloc, try to fetch memory traditionally. 
*/ dmab->dev.type = SNDRV_DMA_TYPE_DEV; #endif /* CONFIG_GENERIC_ALLOCATOR */ case SNDRV_DMA_TYPE_DEV: dmab->area = snd_malloc_dev_pages(device, size, &dmab->addr); break; #endif #ifdef CONFIG_SND_DMA_SGBUF case SNDRV_DMA_TYPE_DEV_SG: snd_malloc_sgbuf_pages(device, size, dmab, NULL); break; #endif default: pr_err("snd-malloc: invalid device type %d\n", type); dmab->area = NULL; dmab->addr = 0; return -ENXIO; } if (! dmab->area) return -ENOMEM; dmab->bytes = size; return 0; } EXPORT_SYMBOL(snd_dma_alloc_pages); /** * snd_dma_alloc_pages_fallback - allocate the buffer area according to the given type with fallback * @type: the DMA buffer type * @device: the device pointer * @size: the buffer size to allocate * @dmab: buffer allocation record to store the allocated data * * Calls the memory-allocator function for the corresponding * buffer type. When no space is left, this function reduces the size and * tries to allocate again. The size actually allocated is stored in * res_size argument. * * Return: Zero if the buffer with the given size is allocated successfully, * otherwise a negative value on error. */ int snd_dma_alloc_pages_fallback(int type, struct device *device, size_t size, struct snd_dma_buffer *dmab) { int err; while ((err = snd_dma_alloc_pages(type, device, size, dmab)) < 0) { if (err != -ENOMEM) return err; if (size <= PAGE_SIZE) return -ENOMEM; size >>= 1; size = PAGE_SIZE << get_order(size); } if (! dmab->area) return -ENOMEM; return 0; } EXPORT_SYMBOL(snd_dma_alloc_pages_fallback); /** * snd_dma_free_pages - release the allocated buffer * @dmab: the buffer allocation record to release * * Releases the allocated buffer via snd_dma_alloc_pages(). */ void snd_dma_free_pages(struct snd_dma_buffer *dmab) { switch (dmab->dev.type) { case SNDRV_DMA_TYPE_CONTINUOUS: snd_free_pages(dmab->area, dmab->bytes); break; #ifdef CONFIG_HAS_DMA #ifdef CONFIG_GENERIC_ALLOCATOR case SNDRV_DMA_TYPE_DEV_IRAM: snd_free_dev_iram(dmab); break; #endif /* CONFIG_GENERIC_ALLOCATOR */ case SNDRV_DMA_TYPE_DEV: snd_free_dev_pages(dmab->dev.dev, dmab->bytes, dmab->area, dmab->addr); break; #endif #ifdef CONFIG_SND_DMA_SGBUF case SNDRV_DMA_TYPE_DEV_SG: snd_free_sgbuf_pages(dmab); break; #endif default: pr_err("snd-malloc: invalid device type %d\n", dmab->dev.type); } } EXPORT_SYMBOL(snd_dma_free_pages); |
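/*
 * Example (not part of memalloc.c above): a minimal sketch of a driver
 * allocating and releasing a coherent DMA buffer with the helpers defined
 * above.  my_chip_alloc_buffer()/my_chip_free_buffer() are hypothetical
 * wrappers; SNDRV_DMA_TYPE_DEV selects the dma_alloc_coherent() path shown
 * earlier.
 */
#include <sound/memalloc.h>

static int my_chip_alloc_buffer(struct device *dev, size_t bytes,
				struct snd_dma_buffer *dmab)
{
	int err;

	/* On success dmab->area is the CPU address, dmab->addr the DMA address. */
	err = snd_dma_alloc_pages(SNDRV_DMA_TYPE_DEV, dev, bytes, dmab);
	if (err < 0)
		return err;

	/* ... program the hardware with dmab->addr and dmab->bytes ... */
	return 0;
}

static void my_chip_free_buffer(struct snd_dma_buffer *dmab)
{
	snd_dma_free_pages(dmab);
}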
887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 | /* * drivers/base/power/wakeup.c - System wakeup events framework * * Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. * * This file is released under the GPLv2. */ #include <linux/device.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/capability.h> #include <linux/export.h> #include <linux/suspend.h> #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/pm_wakeirq.h> #include <trace/events/power.h> #include "power.h" #ifndef CONFIG_SUSPEND suspend_state_t pm_suspend_target_state; #define pm_suspend_target_state (PM_SUSPEND_ON) #endif /* * If set, the suspend/hibernate code will abort transitions to a sleep state * if wakeup events are registered during or immediately before the transition. */ bool events_check_enabled __read_mostly; /* First wakeup IRQ seen by the kernel in the last cycle. */ unsigned int pm_wakeup_irq __read_mostly; /* If greater than 0 and the system is suspending, terminate the suspend. */ static atomic_t pm_abort_suspend __read_mostly; /* * Combined counters of registered wakeup events and wakeup events in progress. * They need to be modified together atomically, so it's better to use one * atomic variable to hold them both. */ static atomic_t combined_event_count = ATOMIC_INIT(0); #define IN_PROGRESS_BITS (sizeof(int) * 4) #define MAX_IN_PROGRESS ((1 << IN_PROGRESS_BITS) - 1) static void split_counters(unsigned int *cnt, unsigned int *inpr) { unsigned int comb = atomic_read(&combined_event_count); *cnt = (comb >> IN_PROGRESS_BITS); *inpr = comb & MAX_IN_PROGRESS; } /* A preserved old value of the events counter. */ static unsigned int saved_count; static DEFINE_RAW_SPINLOCK(events_lock); static void pm_wakeup_timer_fn(struct timer_list *t); static LIST_HEAD(wakeup_sources); static DECLARE_WAIT_QUEUE_HEAD(wakeup_count_wait_queue); DEFINE_STATIC_SRCU(wakeup_srcu); static struct wakeup_source deleted_ws = { .name = "deleted", .lock = __SPIN_LOCK_UNLOCKED(deleted_ws.lock), }; /** * wakeup_source_prepare - Prepare a new wakeup source for initialization. * @ws: Wakeup source to prepare. * @name: Pointer to the name of the new wakeup source. * * Callers must ensure that the @name string won't be freed when @ws is still in * use. 
*/ void wakeup_source_prepare(struct wakeup_source *ws, const char *name) { if (ws) { memset(ws, 0, sizeof(*ws)); ws->name = name; } } EXPORT_SYMBOL_GPL(wakeup_source_prepare); /** * wakeup_source_create - Create a struct wakeup_source object. * @name: Name of the new wakeup source. */ struct wakeup_source *wakeup_source_create(const char *name) { struct wakeup_source *ws; ws = kmalloc(sizeof(*ws), GFP_KERNEL); if (!ws) return NULL; wakeup_source_prepare(ws, name ? kstrdup_const(name, GFP_KERNEL) : NULL); return ws; } EXPORT_SYMBOL_GPL(wakeup_source_create); /** * wakeup_source_drop - Prepare a struct wakeup_source object for destruction. * @ws: Wakeup source to prepare for destruction. * * Callers must ensure that __pm_stay_awake() or __pm_wakeup_event() will never * be run in parallel with this function for the same wakeup source object. */ void wakeup_source_drop(struct wakeup_source *ws) { if (!ws) return; __pm_relax(ws); } EXPORT_SYMBOL_GPL(wakeup_source_drop); /* * Record wakeup_source statistics being deleted into a dummy wakeup_source. */ static void wakeup_source_record(struct wakeup_source *ws) { unsigned long flags; spin_lock_irqsave(&deleted_ws.lock, flags); if (ws->event_count) { deleted_ws.total_time = ktime_add(deleted_ws.total_time, ws->total_time); deleted_ws.prevent_sleep_time = ktime_add(deleted_ws.prevent_sleep_time, ws->prevent_sleep_time); deleted_ws.max_time = ktime_compare(deleted_ws.max_time, ws->max_time) > 0 ? deleted_ws.max_time : ws->max_time; deleted_ws.event_count += ws->event_count; deleted_ws.active_count += ws->active_count; deleted_ws.relax_count += ws->relax_count; deleted_ws.expire_count += ws->expire_count; deleted_ws.wakeup_count += ws->wakeup_count; } spin_unlock_irqrestore(&deleted_ws.lock, flags); } /** * wakeup_source_destroy - Destroy a struct wakeup_source object. * @ws: Wakeup source to destroy. * * Use only for wakeup source objects created with wakeup_source_create(). */ void wakeup_source_destroy(struct wakeup_source *ws) { if (!ws) return; wakeup_source_drop(ws); wakeup_source_record(ws); kfree_const(ws->name); kfree(ws); } EXPORT_SYMBOL_GPL(wakeup_source_destroy); /** * wakeup_source_add - Add given object to the list of wakeup sources. * @ws: Wakeup source object to add to the list. */ void wakeup_source_add(struct wakeup_source *ws) { unsigned long flags; if (WARN_ON(!ws)) return; spin_lock_init(&ws->lock); timer_setup(&ws->timer, pm_wakeup_timer_fn, 0); ws->active = false; raw_spin_lock_irqsave(&events_lock, flags); list_add_rcu(&ws->entry, &wakeup_sources); raw_spin_unlock_irqrestore(&events_lock, flags); } EXPORT_SYMBOL_GPL(wakeup_source_add); /** * wakeup_source_remove - Remove given object from the wakeup sources list. * @ws: Wakeup source object to remove from the list. */ void wakeup_source_remove(struct wakeup_source *ws) { unsigned long flags; if (WARN_ON(!ws)) return; raw_spin_lock_irqsave(&events_lock, flags); list_del_rcu(&ws->entry); raw_spin_unlock_irqrestore(&events_lock, flags); synchronize_srcu(&wakeup_srcu); del_timer_sync(&ws->timer); /* * Clear timer.function to make wakeup_source_not_registered() treat * this wakeup source as not registered. */ ws->timer.function = NULL; } EXPORT_SYMBOL_GPL(wakeup_source_remove); /** * wakeup_source_register - Create wakeup source and add it to the list. * @name: Name of the wakeup source to register. 
*/ struct wakeup_source *wakeup_source_register(const char *name) { struct wakeup_source *ws; ws = wakeup_source_create(name); if (ws) wakeup_source_add(ws); return ws; } EXPORT_SYMBOL_GPL(wakeup_source_register); /** * wakeup_source_unregister - Remove wakeup source from the list and remove it. * @ws: Wakeup source object to unregister. */ void wakeup_source_unregister(struct wakeup_source *ws) { if (ws) { wakeup_source_remove(ws); wakeup_source_destroy(ws); } } EXPORT_SYMBOL_GPL(wakeup_source_unregister); /** * device_wakeup_attach - Attach a wakeup source object to a device object. * @dev: Device to handle. * @ws: Wakeup source object to attach to @dev. * * This causes @dev to be treated as a wakeup device. */ static int device_wakeup_attach(struct device *dev, struct wakeup_source *ws) { spin_lock_irq(&dev->power.lock); if (dev->power.wakeup) { spin_unlock_irq(&dev->power.lock); return -EEXIST; } dev->power.wakeup = ws; if (dev->power.wakeirq) device_wakeup_attach_irq(dev, dev->power.wakeirq); spin_unlock_irq(&dev->power.lock); return 0; } /** * device_wakeup_enable - Enable given device to be a wakeup source. * @dev: Device to handle. * * Create a wakeup source object, register it and attach it to @dev. */ int device_wakeup_enable(struct device *dev) { struct wakeup_source *ws; int ret; if (!dev || !dev->power.can_wakeup) return -EINVAL; if (pm_suspend_target_state != PM_SUSPEND_ON) dev_dbg(dev, "Suspicious %s() during system transition!\n", __func__); ws = wakeup_source_register(dev_name(dev)); if (!ws) return -ENOMEM; ret = device_wakeup_attach(dev, ws); if (ret) wakeup_source_unregister(ws); return ret; } EXPORT_SYMBOL_GPL(device_wakeup_enable); /** * device_wakeup_attach_irq - Attach a wakeirq to a wakeup source * @dev: Device to handle * @wakeirq: Device specific wakeirq entry * * Attach a device wakeirq to the wakeup source so the device * wake IRQ can be configured automatically for suspend and * resume. * * Call under the device's power.lock lock. */ void device_wakeup_attach_irq(struct device *dev, struct wake_irq *wakeirq) { struct wakeup_source *ws; ws = dev->power.wakeup; if (!ws) return; if (ws->wakeirq) dev_err(dev, "Leftover wakeup IRQ found, overriding\n"); ws->wakeirq = wakeirq; } /** * device_wakeup_detach_irq - Detach a wakeirq from a wakeup source * @dev: Device to handle * * Removes a device wakeirq from the wakeup source. * * Call under the device's power.lock lock. */ void device_wakeup_detach_irq(struct device *dev) { struct wakeup_source *ws; ws = dev->power.wakeup; if (ws) ws->wakeirq = NULL; } /** * device_wakeup_arm_wake_irqs(void) * * Itereates over the list of device wakeirqs to arm them. */ void device_wakeup_arm_wake_irqs(void) { struct wakeup_source *ws; int srcuidx; srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu(ws, &wakeup_sources, entry) dev_pm_arm_wake_irq(ws->wakeirq); srcu_read_unlock(&wakeup_srcu, srcuidx); } /** * device_wakeup_disarm_wake_irqs(void) * * Itereates over the list of device wakeirqs to disarm them. */ void device_wakeup_disarm_wake_irqs(void) { struct wakeup_source *ws; int srcuidx; srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu(ws, &wakeup_sources, entry) dev_pm_disarm_wake_irq(ws->wakeirq); srcu_read_unlock(&wakeup_srcu, srcuidx); } /** * device_wakeup_detach - Detach a device's wakeup source object from it. * @dev: Device to detach the wakeup source object from. * * After it returns, @dev will not be treated as a wakeup device any more. 
*/ static struct wakeup_source *device_wakeup_detach(struct device *dev) { struct wakeup_source *ws; spin_lock_irq(&dev->power.lock); ws = dev->power.wakeup; dev->power.wakeup = NULL; spin_unlock_irq(&dev->power.lock); return ws; } /** * device_wakeup_disable - Do not regard a device as a wakeup source any more. * @dev: Device to handle. * * Detach the @dev's wakeup source object from it, unregister this wakeup source * object and destroy it. */ int device_wakeup_disable(struct device *dev) { struct wakeup_source *ws; if (!dev || !dev->power.can_wakeup) return -EINVAL; ws = device_wakeup_detach(dev); wakeup_source_unregister(ws); return 0; } EXPORT_SYMBOL_GPL(device_wakeup_disable); /** * device_set_wakeup_capable - Set/reset device wakeup capability flag. * @dev: Device to handle. * @capable: Whether or not @dev is capable of waking up the system from sleep. * * If @capable is set, set the @dev's power.can_wakeup flag and add its * wakeup-related attributes to sysfs. Otherwise, unset the @dev's * power.can_wakeup flag and remove its wakeup-related attributes from sysfs. * * This function may sleep and it can't be called from any context where * sleeping is not allowed. */ void device_set_wakeup_capable(struct device *dev, bool capable) { if (!!dev->power.can_wakeup == !!capable) return; dev->power.can_wakeup = capable; if (device_is_registered(dev) && !list_empty(&dev->power.entry)) { if (capable) { int ret = wakeup_sysfs_add(dev); if (ret) dev_info(dev, "Wakeup sysfs attributes not added\n"); } else { wakeup_sysfs_remove(dev); } } } EXPORT_SYMBOL_GPL(device_set_wakeup_capable); /** * device_init_wakeup - Device wakeup initialization. * @dev: Device to handle. * @enable: Whether or not to enable @dev as a wakeup device. * * By default, most devices should leave wakeup disabled. The exceptions are * devices that everyone expects to be wakeup sources: keyboards, power buttons, * possibly network interfaces, etc. Also, devices that don't generate their * own wakeup requests but merely forward requests from one bus to another * (like PCI bridges) should have wakeup enabled by default. */ int device_init_wakeup(struct device *dev, bool enable) { int ret = 0; if (!dev) return -EINVAL; if (enable) { device_set_wakeup_capable(dev, true); ret = device_wakeup_enable(dev); } else { device_wakeup_disable(dev); device_set_wakeup_capable(dev, false); } return ret; } EXPORT_SYMBOL_GPL(device_init_wakeup); /** * device_set_wakeup_enable - Enable or disable a device to wake up the system. * @dev: Device to handle. */ int device_set_wakeup_enable(struct device *dev, bool enable) { return enable ? device_wakeup_enable(dev) : device_wakeup_disable(dev); } EXPORT_SYMBOL_GPL(device_set_wakeup_enable); /** * wakeup_source_not_registered - validate the given wakeup source. * @ws: Wakeup source to be validated. */ static bool wakeup_source_not_registered(struct wakeup_source *ws) { /* * Use timer struct to check if the given source is initialized * by wakeup_source_add. */ return ws->timer.function != pm_wakeup_timer_fn; } /* * The functions below use the observation that each wakeup event starts a * period in which the system should not be suspended. The moment this period * will end depends on how the wakeup event is going to be processed after being * detected and all of the possible cases can be divided into two distinct * groups. 
 * * First, a wakeup event may be detected by the same functional unit that will * carry out the entire processing of it and possibly will pass it to user space * for further processing. In that case the functional unit that has detected * the event may later "close" the "no suspend" period associated with it * directly as soon as it has been dealt with. The pair of pm_stay_awake() and * pm_relax(), balanced with each other, is supposed to be used in such * situations. * * Second, a wakeup event may be detected by one functional unit and processed * by another one. In that case the unit that has detected it cannot really * "close" the "no suspend" period associated with it, unless it knows in * advance what's going to happen to the event during processing. This * knowledge, however, may not be available to it, so it can simply specify time * to wait before the system can be suspended and pass it as the second * argument of pm_wakeup_event(). * * It is valid to call pm_relax() after pm_wakeup_event(), in which case the * "no suspend" period will be ended either by the pm_relax(), or by the timer * function executed when the timer expires, whichever comes first. */ /** * wakeup_source_activate - Mark given wakeup source as active. * @ws: Wakeup source to handle. * * Update the @ws' statistics and, if @ws has just been activated, notify the PM * core of the event by incrementing the counter of wakeup events being * processed. */ static void wakeup_source_activate(struct wakeup_source *ws) { unsigned int cec; if (WARN_ONCE(wakeup_source_not_registered(ws), "unregistered wakeup source\n")) return; ws->active = true; ws->active_count++; ws->last_time = ktime_get(); if (ws->autosleep_enabled) ws->start_prevent_time = ws->last_time; /* Increment the counter of events in progress. */ cec = atomic_inc_return(&combined_event_count); trace_wakeup_source_activate(ws->name, cec); } /** * wakeup_source_report_event - Report wakeup event using the given source. * @ws: Wakeup source to report the event for. * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. */ static void wakeup_source_report_event(struct wakeup_source *ws, bool hard) { ws->event_count++; /* This is racy, but the counter is approximate anyway. */ if (events_check_enabled) ws->wakeup_count++; if (!ws->active) wakeup_source_activate(ws); if (hard) pm_system_wakeup(); } /** * __pm_stay_awake - Notify the PM core of a wakeup event. * @ws: Wakeup source object associated with the source of the event. * * It is safe to call this function from interrupt context. */ void __pm_stay_awake(struct wakeup_source *ws) { unsigned long flags; if (!ws) return; spin_lock_irqsave(&ws->lock, flags); wakeup_source_report_event(ws, false); del_timer(&ws->timer); ws->timer_expires = 0; spin_unlock_irqrestore(&ws->lock, flags); } EXPORT_SYMBOL_GPL(__pm_stay_awake); /** * pm_stay_awake - Notify the PM core that a wakeup event is being processed. * @dev: Device the wakeup event is related to. * * Notify the PM core of a wakeup event (signaled by @dev) by calling * __pm_stay_awake for the @dev's wakeup source object. * * Call this function after detecting a wakeup event if pm_relax() is going * to be called directly after processing the event (and possibly passing it to * user space for further processing). 
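 *
 * A typical pairing, sketched for illustration (the calling code below is
 * hypothetical):
 *
 *	pm_stay_awake(dev);	(in the code that detects the wakeup event)
 *	...			(the event is processed entirely by this driver)
 *	pm_relax(dev);		(once processing has finished)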
*/ void pm_stay_awake(struct device *dev) { unsigned long flags; if (!dev) return; spin_lock_irqsave(&dev->power.lock, flags); __pm_stay_awake(dev->power.wakeup); spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_stay_awake); #ifdef CONFIG_PM_AUTOSLEEP static void update_prevent_sleep_time(struct wakeup_source *ws, ktime_t now) { ktime_t delta = ktime_sub(now, ws->start_prevent_time); ws->prevent_sleep_time = ktime_add(ws->prevent_sleep_time, delta); } #else static inline void update_prevent_sleep_time(struct wakeup_source *ws, ktime_t now) {} #endif /** * wakeup_source_deactivate - Mark given wakeup source as inactive. * @ws: Wakeup source to handle. * * Update the @ws' statistics and notify the PM core that the wakeup source has * become inactive by decrementing the counter of wakeup events being processed * and incrementing the counter of registered wakeup events. */ static void wakeup_source_deactivate(struct wakeup_source *ws) { unsigned int cnt, inpr, cec; ktime_t duration; ktime_t now; ws->relax_count++; /* * __pm_relax() may be called directly or from a timer function. * If it is called directly right after the timer function has been * started, but before the timer function calls __pm_relax(), it is * possible that __pm_stay_awake() will be called in the meantime and * will set ws->active. Then, ws->active may be cleared immediately * by the __pm_relax() called from the timer function, but in such a * case ws->relax_count will be different from ws->active_count. */ if (ws->relax_count != ws->active_count) { ws->relax_count--; return; } ws->active = false; now = ktime_get(); duration = ktime_sub(now, ws->last_time); ws->total_time = ktime_add(ws->total_time, duration); if (ktime_to_ns(duration) > ktime_to_ns(ws->max_time)) ws->max_time = duration; ws->last_time = now; del_timer(&ws->timer); ws->timer_expires = 0; if (ws->autosleep_enabled) update_prevent_sleep_time(ws, now); /* * Increment the counter of registered wakeup events and decrement the * counter of wakeup events in progress simultaneously. */ cec = atomic_add_return(MAX_IN_PROGRESS, &combined_event_count); trace_wakeup_source_deactivate(ws->name, cec); split_counters(&cnt, &inpr); if (!inpr && waitqueue_active(&wakeup_count_wait_queue)) wake_up(&wakeup_count_wait_queue); } /** * __pm_relax - Notify the PM core that processing of a wakeup event has ended. * @ws: Wakeup source object associated with the source of the event. * * Call this function for wakeup events whose processing started with calling * __pm_stay_awake(). * * It is safe to call it from interrupt context. */ void __pm_relax(struct wakeup_source *ws) { unsigned long flags; if (!ws) return; spin_lock_irqsave(&ws->lock, flags); if (ws->active) wakeup_source_deactivate(ws); spin_unlock_irqrestore(&ws->lock, flags); } EXPORT_SYMBOL_GPL(__pm_relax); /** * pm_relax - Notify the PM core that processing of a wakeup event has ended. * @dev: Device that signaled the event. * * Execute __pm_relax() for the @dev's wakeup source object. */ void pm_relax(struct device *dev) { unsigned long flags; if (!dev) return; spin_lock_irqsave(&dev->power.lock, flags); __pm_relax(dev->power.wakeup); spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_relax); /** * pm_wakeup_timer_fn - Delayed finalization of a wakeup event. * @t: Timer contained in the wakeup source object associated with the event source. 
 * * Call wakeup_source_deactivate() for the wakeup source that contains @t * if it is currently active and its timer has not been canceled and * the expiration time of the timer is not in the future. */ static void pm_wakeup_timer_fn(struct timer_list *t) { struct wakeup_source *ws = from_timer(ws, t, timer); unsigned long flags; spin_lock_irqsave(&ws->lock, flags); if (ws->active && ws->timer_expires && time_after_eq(jiffies, ws->timer_expires)) { wakeup_source_deactivate(ws); ws->expire_count++; } spin_unlock_irqrestore(&ws->lock, flags); } /** * pm_wakeup_ws_event - Notify the PM core of a wakeup event. * @ws: Wakeup source object associated with the event source. * @msec: Anticipated event processing time (in milliseconds). * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. * * Notify the PM core of a wakeup event whose source is @ws that will take * approximately @msec milliseconds to be processed by the kernel. If @ws is * not active, activate it. If @msec is nonzero, set up the @ws' timer to * execute pm_wakeup_timer_fn() in the future. * * It is safe to call this function from interrupt context. */ void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard) { unsigned long flags; unsigned long expires; if (!ws) return; spin_lock_irqsave(&ws->lock, flags); wakeup_source_report_event(ws, hard); if (!msec) { wakeup_source_deactivate(ws); goto unlock; } expires = jiffies + msecs_to_jiffies(msec); if (!expires) expires = 1; if (!ws->timer_expires || time_after(expires, ws->timer_expires)) { mod_timer(&ws->timer, expires); ws->timer_expires = expires; } unlock: spin_unlock_irqrestore(&ws->lock, flags); } EXPORT_SYMBOL_GPL(pm_wakeup_ws_event); /** * pm_wakeup_dev_event - Notify the PM core of a wakeup event. * @dev: Device the wakeup event is related to. * @msec: Anticipated event processing time (in milliseconds). * @hard: If set, abort suspends in progress and wake up from suspend-to-idle. * * Call pm_wakeup_ws_event() for the @dev's wakeup source object. */ void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard) { unsigned long flags; if (!dev) return; spin_lock_irqsave(&dev->power.lock, flags); pm_wakeup_ws_event(dev->power.wakeup, msec, hard); spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_wakeup_dev_event); void pm_print_active_wakeup_sources(void) { struct wakeup_source *ws; int srcuidx, active = 0; struct wakeup_source *last_activity_ws = NULL; srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu(ws, &wakeup_sources, entry) { if (ws->active) { pr_debug("active wakeup source: %s\n", ws->name); active = 1; } else if (!active && (!last_activity_ws || ktime_to_ns(ws->last_time) > ktime_to_ns(last_activity_ws->last_time))) { last_activity_ws = ws; } } if (!active && last_activity_ws) pr_debug("last active wakeup source: %s\n", last_activity_ws->name); srcu_read_unlock(&wakeup_srcu, srcuidx); } EXPORT_SYMBOL_GPL(pm_print_active_wakeup_sources); /** * pm_wakeup_pending - Check if power transition in progress should be aborted. * * Compare the current number of registered wakeup events with its preserved * value from the past and return true if new wakeup events have been registered * since the old value was stored. Also return true if the current number of * wakeup events being processed is different from zero. 
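 *
 * Sketch of the intended use (illustrative; the surrounding suspend code and
 * the label are simplified): the suspend path calls this late in the
 * transition and backs out if a wakeup event has slipped in, e.g.
 *
 *	if (pm_wakeup_pending()) {
 *		error = -EBUSY;
 *		goto out;
 *	}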
*/ bool pm_wakeup_pending(void) { unsigned long flags; bool ret = false; raw_spin_lock_irqsave(&events_lock, flags); if (events_check_enabled) { unsigned int cnt, inpr; split_counters(&cnt, &inpr); ret = (cnt != saved_count || inpr > 0); events_check_enabled = !ret; } raw_spin_unlock_irqrestore(&events_lock, flags); if (ret) { pr_debug("PM: Wakeup pending, aborting suspend\n"); pm_print_active_wakeup_sources(); } return ret || atomic_read(&pm_abort_suspend) > 0; } void pm_system_wakeup(void) { atomic_inc(&pm_abort_suspend); s2idle_wake(); } EXPORT_SYMBOL_GPL(pm_system_wakeup); void pm_system_cancel_wakeup(void) { atomic_dec_if_positive(&pm_abort_suspend); } void pm_wakeup_clear(bool reset) { pm_wakeup_irq = 0; if (reset) atomic_set(&pm_abort_suspend, 0); } void pm_system_irq_wakeup(unsigned int irq_number) { if (pm_wakeup_irq == 0) { pm_wakeup_irq = irq_number; pm_system_wakeup(); } } /** * pm_get_wakeup_count - Read the number of registered wakeup events. * @count: Address to store the value at. * @block: Whether or not to block. * * Store the number of registered wakeup events at the address in @count. If * @block is set, block until the current number of wakeup events being * processed is zero. * * Return 'false' if the current number of wakeup events being processed is * nonzero. Otherwise return 'true'. */ bool pm_get_wakeup_count(unsigned int *count, bool block) { unsigned int cnt, inpr; if (block) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait(&wakeup_count_wait_queue, &wait, TASK_INTERRUPTIBLE); split_counters(&cnt, &inpr); if (inpr == 0 || signal_pending(current)) break; pm_print_active_wakeup_sources(); schedule(); } finish_wait(&wakeup_count_wait_queue, &wait); } split_counters(&cnt, &inpr); *count = cnt; return !inpr; } /** * pm_save_wakeup_count - Save the current number of registered wakeup events. * @count: Value to compare with the current number of registered wakeup events. * * If @count is equal to the current number of registered wakeup events and the * current number of wakeup events being processed is zero, store @count as the * old number of registered wakeup events for pm_wakeup_pending(), enable * wakeup events detection and return 'true'. Otherwise disable wakeup events * detection and return 'false'. */ bool pm_save_wakeup_count(unsigned int count) { unsigned int cnt, inpr; unsigned long flags; events_check_enabled = false; raw_spin_lock_irqsave(&events_lock, flags); split_counters(&cnt, &inpr); if (cnt == count && inpr == 0) { saved_count = count; events_check_enabled = true; } raw_spin_unlock_irqrestore(&events_lock, flags); return events_check_enabled; } #ifdef CONFIG_PM_AUTOSLEEP /** * pm_wakep_autosleep_enabled - Modify autosleep_enabled for all wakeup sources. * @set: Whether to set or to clear the autosleep_enabled flags. */ void pm_wakep_autosleep_enabled(bool set) { struct wakeup_source *ws; ktime_t now = ktime_get(); int srcuidx; srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu(ws, &wakeup_sources, entry) { spin_lock_irq(&ws->lock); if (ws->autosleep_enabled != set) { ws->autosleep_enabled = set; if (ws->active) { if (set) ws->start_prevent_time = now; else update_prevent_sleep_time(ws, now); } } spin_unlock_irq(&ws->lock); } srcu_read_unlock(&wakeup_srcu, srcuidx); } #endif /* CONFIG_PM_AUTOSLEEP */ static struct dentry *wakeup_sources_stats_dentry; /** * print_wakeup_source_stats - Print wakeup source statistics information. * @m: seq_file to print the statistics into. * @ws: Wakeup source object to print the statistics for. 
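 *
 * Each wakeup source becomes one row of /sys/kernel/debug/wakeup_sources,
 * under the header emitted by wakeup_sources_stats_seq_start() below (name,
 * active_count, event_count, wakeup_count, expire_count, active_since,
 * total_time, max_time, last_change, prevent_suspend_time); all times are
 * reported in milliseconds via ktime_to_ms().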
*/ static int print_wakeup_source_stats(struct seq_file *m, struct wakeup_source *ws) { unsigned long flags; ktime_t total_time; ktime_t max_time; unsigned long active_count; ktime_t active_time; ktime_t prevent_sleep_time; spin_lock_irqsave(&ws->lock, flags); total_time = ws->total_time; max_time = ws->max_time; prevent_sleep_time = ws->prevent_sleep_time; active_count = ws->active_count; if (ws->active) { ktime_t now = ktime_get(); active_time = ktime_sub(now, ws->last_time); total_time = ktime_add(total_time, active_time); if (active_time > max_time) max_time = active_time; if (ws->autosleep_enabled) prevent_sleep_time = ktime_add(prevent_sleep_time, ktime_sub(now, ws->start_prevent_time)); } else { active_time = 0; } seq_printf(m, "%-12s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n", ws->name, active_count, ws->event_count, ws->wakeup_count, ws->expire_count, ktime_to_ms(active_time), ktime_to_ms(total_time), ktime_to_ms(max_time), ktime_to_ms(ws->last_time), ktime_to_ms(prevent_sleep_time)); spin_unlock_irqrestore(&ws->lock, flags); return 0; } static void *wakeup_sources_stats_seq_start(struct seq_file *m, loff_t *pos) { struct wakeup_source *ws; loff_t n = *pos; int *srcuidx = m->private; if (n == 0) { seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t" "expire_count\tactive_since\ttotal_time\tmax_time\t" "last_change\tprevent_suspend_time\n"); } *srcuidx = srcu_read_lock(&wakeup_srcu); list_for_each_entry_rcu(ws, &wakeup_sources, entry) { if (n-- <= 0) return ws; } return NULL; } static void *wakeup_sources_stats_seq_next(struct seq_file *m, void *v, loff_t *pos) { struct wakeup_source *ws = v; struct wakeup_source *next_ws = NULL; ++(*pos); list_for_each_entry_continue_rcu(ws, &wakeup_sources, entry) { next_ws = ws; break; } return next_ws; } static void wakeup_sources_stats_seq_stop(struct seq_file *m, void *v) { int *srcuidx = m->private; srcu_read_unlock(&wakeup_srcu, *srcuidx); } /** * wakeup_sources_stats_seq_show - Print wakeup sources statistics information. * @m: seq_file to print the statistics into. * @v: wakeup_source of each iteration */ static int wakeup_sources_stats_seq_show(struct seq_file *m, void *v) { struct wakeup_source *ws = v; print_wakeup_source_stats(m, ws); return 0; } static const struct seq_operations wakeup_sources_stats_seq_ops = { .start = wakeup_sources_stats_seq_start, .next = wakeup_sources_stats_seq_next, .stop = wakeup_sources_stats_seq_stop, .show = wakeup_sources_stats_seq_show, }; static int wakeup_sources_stats_open(struct inode *inode, struct file *file) { return seq_open_private(file, &wakeup_sources_stats_seq_ops, sizeof(int)); } static const struct file_operations wakeup_sources_stats_fops = { .owner = THIS_MODULE, .open = wakeup_sources_stats_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, }; static int __init wakeup_sources_debugfs_init(void) { wakeup_sources_stats_dentry = debugfs_create_file("wakeup_sources", S_IRUGO, NULL, NULL, &wakeup_sources_stats_fops); return 0; } postcore_initcall(wakeup_sources_debugfs_init);
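/*
 * Editor's illustrative sketch (not part of the wakeup code above or of the
 * Ceph messenger below): how a hypothetical driver might tie the wakeup API
 * together. Every "foo_" identifier is made up for illustration.
 */
#include <linux/device.h>
#include <linux/interrupt.h>
#include <linux/pm_wakeup.h>

struct foo_device {
	struct device *dev;
};

/*
 * Hypothetical wake-capable interrupt: the event is handed off to other code,
 * so give the rest of the system roughly 200 ms to pick it up before it may
 * suspend again.
 */
static irqreturn_t foo_wake_irq_handler(int irq, void *data)
{
	struct foo_device *foo = data;

	pm_wakeup_dev_event(foo->dev, 200, false);
	return IRQ_HANDLED;
}

static int foo_probe_wakeup(struct foo_device *foo)
{
	/* Mark the device wakeup-capable and create/attach its wakeup source. */
	return device_init_wakeup(foo->dev, true);
}

static void foo_remove_wakeup(struct foo_device *foo)
{
	device_init_wakeup(foo->dev, false);
}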
// SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/crc32c.h> #include <linux/ctype.h> #include <linux/highmem.h> #include <linux/inet.h> #include <linux/kthread.h> #include <linux/net.h> #include <linux/nsproxy.h> #include <linux/sched/mm.h> #include <linux/slab.h> #include <linux/socket.h> #include <linux/string.h> #ifdef CONFIG_BLOCK #include <linux/bio.h> #endif /* CONFIG_BLOCK */ #include <linux/dns_resolver.h> #include <net/tcp.h> #include <linux/ceph/ceph_features.h> #include <linux/ceph/libceph.h> #include <linux/ceph/messenger.h> #include <linux/ceph/decode.h> #include <linux/ceph/pagelist.h> #include <linux/export.h> /* * Ceph uses the messenger to 
exchange ceph_msg messages with other * hosts in the system. The messenger provides ordered and reliable * delivery. We tolerate TCP disconnects by reconnecting (with * exponential backoff) in the case of a fault (disconnection, bad * crc, protocol error). Acks allow sent messages to be discarded by * the sender. */ /* * We track the state of the socket on a given connection using * values defined below. The transition to a new socket state is * handled by a function which verifies we aren't coming from an * unexpected state. * * -------- * | NEW* | transient initial state * -------- * | con_sock_state_init() * v * ---------- * | CLOSED | initialized, but no socket (and no * ---------- TCP connection) * ^ \ * | \ con_sock_state_connecting() * | ---------------------- * | \ * + con_sock_state_closed() \ * |+--------------------------- \ * | \ \ \ * | ----------- \ \ * | | CLOSING | socket event; \ \ * | ----------- await close \ \ * | ^ \ | * | | \ | * | + con_sock_state_closing() \ | * | / \ | | * | / --------------- | | * | / \ v v * | / -------------- * | / -----------------| CONNECTING | socket created, TCP * | | / -------------- connect initiated * | | | con_sock_state_connected() * | | v * ------------- * | CONNECTED | TCP connection established * ------------- * * State values for ceph_connection->sock_state; NEW is assumed to be 0. */ #define CON_SOCK_STATE_NEW 0 /* -> CLOSED */ #define CON_SOCK_STATE_CLOSED 1 /* -> CONNECTING */ #define CON_SOCK_STATE_CONNECTING 2 /* -> CONNECTED or -> CLOSING */ #define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ #define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ /* * connection states */ #define CON_STATE_CLOSED 1 /* -> PREOPEN */ #define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */ #define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */ #define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */ #define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */ #define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */ /* * ceph_connection flag bits */ #define CON_FLAG_LOSSYTX 0 /* we can close channel or drop * messages on errors */ #define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */ #define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */ #define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ #define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ static bool con_flag_valid(unsigned long con_flag) { switch (con_flag) { case CON_FLAG_LOSSYTX: case CON_FLAG_KEEPALIVE_PENDING: case CON_FLAG_WRITE_PENDING: case CON_FLAG_SOCK_CLOSED: case CON_FLAG_BACKOFF: return true; default: return false; } } static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); clear_bit(con_flag, &con->flags); } static void con_flag_set(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); set_bit(con_flag, &con->flags); } static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_bit(con_flag, &con->flags); } static bool con_flag_test_and_clear(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_and_clear_bit(con_flag, &con->flags); } static bool con_flag_test_and_set(struct ceph_connection *con, unsigned long con_flag) { BUG_ON(!con_flag_valid(con_flag)); return test_and_set_bit(con_flag, &con->flags); } /* Slab caches for frequently-allocated structures */ static struct kmem_cache *ceph_msg_cache; static struct 
kmem_cache *ceph_msg_data_cache; /* static tag bytes (protocol control messages) */ static char tag_msg = CEPH_MSGR_TAG_MSG; static char tag_ack = CEPH_MSGR_TAG_ACK; static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2; #ifdef CONFIG_LOCKDEP static struct lock_class_key socket_class; #endif static void queue_con(struct ceph_connection *con); static void cancel_con(struct ceph_connection *con); static void ceph_con_workfn(struct work_struct *); static void con_fault(struct ceph_connection *con); /* * Nicely render a sockaddr as a string. An array of formatted * strings is used, to approximate reentrancy. */ #define ADDR_STR_COUNT_LOG 5 /* log2(# address strings in array) */ #define ADDR_STR_COUNT (1 << ADDR_STR_COUNT_LOG) #define ADDR_STR_COUNT_MASK (ADDR_STR_COUNT - 1) #define MAX_ADDR_STR_LEN 64 /* 54 is enough */ static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; static atomic_t addr_str_seq = ATOMIC_INIT(0); static struct page *zero_page; /* used in certain error cases */ const char *ceph_pr_addr(const struct sockaddr_storage *ss) { int i; char *s; struct sockaddr_in *in4 = (struct sockaddr_in *) ss; struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss; i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK; s = addr_str[i]; switch (ss->ss_family) { case AF_INET: snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr, ntohs(in4->sin_port)); break; case AF_INET6: snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%hu", &in6->sin6_addr, ntohs(in6->sin6_port)); break; default: snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)", ss->ss_family); } return s; } EXPORT_SYMBOL(ceph_pr_addr); static void encode_my_addr(struct ceph_messenger *msgr) { memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); ceph_encode_addr(&msgr->my_enc_addr); } /* * work queue for all reading and writing to/from the socket. */ static struct workqueue_struct *ceph_msgr_wq; static int ceph_msgr_slab_init(void) { BUG_ON(ceph_msg_cache); ceph_msg_cache = KMEM_CACHE(ceph_msg, 0); if (!ceph_msg_cache) return -ENOMEM; BUG_ON(ceph_msg_data_cache); ceph_msg_data_cache = KMEM_CACHE(ceph_msg_data, 0); if (ceph_msg_data_cache) return 0; kmem_cache_destroy(ceph_msg_cache); ceph_msg_cache = NULL; return -ENOMEM; } static void ceph_msgr_slab_exit(void) { BUG_ON(!ceph_msg_data_cache); kmem_cache_destroy(ceph_msg_data_cache); ceph_msg_data_cache = NULL; BUG_ON(!ceph_msg_cache); kmem_cache_destroy(ceph_msg_cache); ceph_msg_cache = NULL; } static void _ceph_msgr_exit(void) { if (ceph_msgr_wq) { destroy_workqueue(ceph_msgr_wq); ceph_msgr_wq = NULL; } BUG_ON(zero_page == NULL); put_page(zero_page); zero_page = NULL; ceph_msgr_slab_exit(); } int __init ceph_msgr_init(void) { if (ceph_msgr_slab_init()) return -ENOMEM; BUG_ON(zero_page != NULL); zero_page = ZERO_PAGE(0); get_page(zero_page); /* * The number of active work items is limited by the number of * connections, so leave @max_active at default. 
*/ ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_MEM_RECLAIM, 0); if (ceph_msgr_wq) return 0; pr_err("msgr_init failed to create workqueue\n"); _ceph_msgr_exit(); return -ENOMEM; } void ceph_msgr_exit(void) { BUG_ON(ceph_msgr_wq == NULL); _ceph_msgr_exit(); } void ceph_msgr_flush(void) { flush_workqueue(ceph_msgr_wq); } EXPORT_SYMBOL(ceph_msgr_flush); /* Connection socket state transition functions */ static void con_sock_state_init(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); if (WARN_ON(old_state != CON_SOCK_STATE_NEW)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CLOSED); } static void con_sock_state_connecting(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING); if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CONNECTING); } static void con_sock_state_connected(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED); if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CONNECTED); } static void con_sock_state_closing(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING); if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING && old_state != CON_SOCK_STATE_CONNECTED && old_state != CON_SOCK_STATE_CLOSING)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CLOSING); } static void con_sock_state_closed(struct ceph_connection *con) { int old_state; old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED && old_state != CON_SOCK_STATE_CLOSING && old_state != CON_SOCK_STATE_CONNECTING && old_state != CON_SOCK_STATE_CLOSED)) printk("%s: unexpected old state %d\n", __func__, old_state); dout("%s con %p sock %d -> %d\n", __func__, con, old_state, CON_SOCK_STATE_CLOSED); } /* * socket callback functions */ /* data available on socket, or listen socket received a connect */ static void ceph_sock_data_ready(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; if (atomic_read(&con->msgr->stopping)) { return; } if (sk->sk_state != TCP_CLOSE_WAIT) { dout("%s on %p state = %lu, queueing work\n", __func__, con, con->state); queue_con(con); } } /* socket has buffer space for writing */ static void ceph_sock_write_space(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; /* only queue to workqueue if there is data we want to write, * and there is sufficient space in the socket buffer to accept * more data. clear SOCK_NOSPACE so that ceph_sock_write_space() * doesn't get called again until try_write() fills the socket * buffer. See net/ipv4/tcp_input.c:tcp_check_space() * and net/core/stream.c:sk_stream_write_space(). 
*/ if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) { if (sk_stream_is_writeable(sk)) { dout("%s %p queueing write work\n", __func__, con); clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); queue_con(con); } } else { dout("%s %p nothing to write\n", __func__, con); } } /* socket's state has changed */ static void ceph_sock_state_change(struct sock *sk) { struct ceph_connection *con = sk->sk_user_data; dout("%s %p state = %lu sk_state = %u\n", __func__, con, con->state, sk->sk_state); switch (sk->sk_state) { case TCP_CLOSE: dout("%s TCP_CLOSE\n", __func__); /* fall through */ case TCP_CLOSE_WAIT: dout("%s TCP_CLOSE_WAIT\n", __func__); con_sock_state_closing(con); con_flag_set(con, CON_FLAG_SOCK_CLOSED); queue_con(con); break; case TCP_ESTABLISHED: dout("%s TCP_ESTABLISHED\n", __func__); con_sock_state_connected(con); queue_con(con); break; default: /* Everything else is uninteresting */ break; } } /* * set up socket callbacks */ static void set_sock_callbacks(struct socket *sock, struct ceph_connection *con) { struct sock *sk = sock->sk; sk->sk_user_data = con; sk->sk_data_ready = ceph_sock_data_ready; sk->sk_write_space = ceph_sock_write_space; sk->sk_state_change = ceph_sock_state_change; } /* * socket helpers */ /* * initiate connection to a remote socket. */ static int ceph_tcp_connect(struct ceph_connection *con) { struct sockaddr_storage *paddr = &con->peer_addr.in_addr; struct socket *sock; unsigned int noio_flag; int ret; BUG_ON(con->sock); /* sock_create_kern() allocates with GFP_KERNEL */ noio_flag = memalloc_noio_save(); ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family, SOCK_STREAM, IPPROTO_TCP, &sock); memalloc_noio_restore(noio_flag); if (ret) return ret; sock->sk->sk_allocation = GFP_NOFS; #ifdef CONFIG_LOCKDEP lockdep_set_class(&sock->sk->sk_lock, &socket_class); #endif set_sock_callbacks(sock, con); dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr)); con_sock_state_connecting(con); ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", ceph_pr_addr(&con->peer_addr.in_addr), sock->sk->sk_state); } else if (ret < 0) { pr_err("connect %s error %d\n", ceph_pr_addr(&con->peer_addr.in_addr), ret); sock_release(sock); return ret; } if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) { int optval = 1; ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&optval, sizeof(optval)); if (ret) pr_err("kernel_setsockopt(TCP_NODELAY) failed: %d", ret); } con->sock = sock; return 0; } /* * If @buf is NULL, discard up to @len bytes. */ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) { struct kvec iov = {buf, len}; struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; int r; if (!buf) msg.msg_flags |= MSG_TRUNC; iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, len); r = sock_recvmsg(sock, &msg, msg.msg_flags); if (r == -EAGAIN) r = 0; return r; } static int ceph_tcp_recvpage(struct socket *sock, struct page *page, int page_offset, size_t length) { struct bio_vec bvec = { .bv_page = page, .bv_offset = page_offset, .bv_len = length }; struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; int r; BUG_ON(page_offset + length > PAGE_SIZE); iov_iter_bvec(&msg.msg_iter, READ | ITER_BVEC, &bvec, 1, length); r = sock_recvmsg(sock, &msg, msg.msg_flags); if (r == -EAGAIN) r = 0; return r; } /* * write something. @more is true if caller will be sending more data * shortly. 
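 *
 * (Editor's note, for illustration: when @more is true the MSG_MORE hint lets
 * the TCP stack hold back a partial segment and coalesce it with the caller's
 * next write, e.g. a message header followed by its data pages; when @more is
 * false that hint is dropped, so the final chunk is not held back.)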
*/ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, size_t kvlen, size_t len, int more) { struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; int r; if (more) msg.msg_flags |= MSG_MORE; else msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ r = kernel_sendmsg(sock, &msg, iov, kvlen, len); if (r == -EAGAIN) r = 0; return r; } static int __ceph_tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, bool more) { int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR); int ret; ret = kernel_sendpage(sock, page, offset, size, flags); if (ret == -EAGAIN) ret = 0; return ret; } static int ceph_tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, bool more) { struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; struct bio_vec bvec; int ret; /* * sendpage cannot properly handle pages with page_count == 0, * we need to fall back to sendmsg if that's the case. * * Same goes for slab pages: skb_can_coalesce() allows * coalescing neighboring slab objects into a single frag which * triggers one of hardened usercopy checks. */ if (page_count(page) >= 1 && !PageSlab(page)) return __ceph_tcp_sendpage(sock, page, offset, size, more); bvec.bv_page = page; bvec.bv_offset = offset; bvec.bv_len = size; if (more) msg.msg_flags |= MSG_MORE; else msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ iov_iter_bvec(&msg.msg_iter, WRITE | ITER_BVEC, &bvec, 1, size); ret = sock_sendmsg(sock, &msg); if (ret == -EAGAIN) ret = 0; return ret; } /* * Shutdown/close the socket for the given connection. */ static int con_close_socket(struct ceph_connection *con) { int rc = 0; dout("con_close_socket on %p sock %p\n", con, con->sock); if (con->sock) { rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); sock_release(con->sock); con->sock = NULL; } /* * Forcibly clear the SOCK_CLOSED flag. It gets set * independent of the connection mutex, and we could have * received a socket close event before we had the chance to * shut the socket down. */ con_flag_clear(con, CON_FLAG_SOCK_CLOSED); con_sock_state_closed(con); return rc; } /* * Reset a connection. Discard all incoming and outgoing messages * and clear *_seq state. */ static void ceph_msg_remove(struct ceph_msg *msg) { list_del_init(&msg->list_head); ceph_msg_put(msg); } static void ceph_msg_remove_list(struct list_head *head) { while (!list_empty(head)) { struct ceph_msg *msg = list_first_entry(head, struct ceph_msg, list_head); ceph_msg_remove(msg); } } static void reset_connection(struct ceph_connection *con) { /* reset connection, out_queue, msg_ and connect_seq */ /* discard existing out_queue and msg_seq */ dout("reset_connection %p\n", con); ceph_msg_remove_list(&con->out_queue); ceph_msg_remove_list(&con->out_sent); if (con->in_msg) { BUG_ON(con->in_msg->con != con); ceph_msg_put(con->in_msg); con->in_msg = NULL; } con->connect_seq = 0; con->out_seq = 0; if (con->out_msg) { BUG_ON(con->out_msg->con != con); ceph_msg_put(con->out_msg); con->out_msg = NULL; } con->in_seq = 0; con->in_seq_acked = 0; con->out_skip = 0; } /* * mark a peer down. drop any open connections. 
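 *
 * (Illustrative lifecycle, simplified: a user of the messenger such as the
 * mon/osd client prepares a connection with ceph_con_init(), points it at a
 * peer with ceph_con_open(), queues messages on it, and eventually tears it
 * down here; after ceph_con_close() the same connection may be reopened with
 * another ceph_con_open().)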
*/ void ceph_con_close(struct ceph_connection *con) { mutex_lock(&con->mutex); dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr.in_addr)); con->state = CON_STATE_CLOSED; con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */ con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING); con_flag_clear(con, CON_FLAG_WRITE_PENDING); con_flag_clear(con, CON_FLAG_BACKOFF); reset_connection(con); con->peer_global_seq = 0; cancel_con(con); con_close_socket(con); mutex_unlock(&con->mutex); } EXPORT_SYMBOL(ceph_con_close); /* * Reopen a closed connection, with a new peer address. */ void ceph_con_open(struct ceph_connection *con, __u8 entity_type, __u64 entity_num, struct ceph_entity_addr *addr) { mutex_lock(&con->mutex); dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); WARN_ON(con->state != CON_STATE_CLOSED); con->state = CON_STATE_PREOPEN; con->peer_name.type = (__u8) entity_type; con->peer_name.num = cpu_to_le64(entity_num); memcpy(&con->peer_addr, addr, sizeof(*addr)); con->delay = 0; /* reset backoff memory */ mutex_unlock(&con->mutex); queue_con(con); } EXPORT_SYMBOL(ceph_con_open); /* * return true if this connection ever successfully opened */ bool ceph_con_opened(struct ceph_connection *con) { return con->connect_seq > 0; } /* * initialize a new connection. */ void ceph_con_init(struct ceph_connection *con, void *private, const struct ceph_connection_operations *ops, struct ceph_messenger *msgr) { dout("con_init %p\n", con); memset(con, 0, sizeof(*con)); con->private = private; con->ops = ops; con->msgr = msgr; con_sock_state_init(con); mutex_init(&con->mutex); INIT_LIST_HEAD(&con->out_queue); INIT_LIST_HEAD(&con->out_sent); INIT_DELAYED_WORK(&con->work, ceph_con_workfn); con->state = CON_STATE_CLOSED; } EXPORT_SYMBOL(ceph_con_init); /* * We maintain a global counter to order connection attempts. Get * a unique seq greater than @gt. */ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) { u32 ret; spin_lock(&msgr->global_seq_lock); if (msgr->global_seq < gt) msgr->global_seq = gt; ret = ++msgr->global_seq; spin_unlock(&msgr->global_seq_lock); return ret; } static void con_out_kvec_reset(struct ceph_connection *con) { BUG_ON(con->out_skip); con->out_kvec_left = 0; con->out_kvec_bytes = 0; con->out_kvec_cur = &con->out_kvec[0]; } static void con_out_kvec_add(struct ceph_connection *con, size_t size, void *data) { int index = con->out_kvec_left; BUG_ON(con->out_skip); BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); con->out_kvec[index].iov_len = size; con->out_kvec[index].iov_base = data; con->out_kvec_left++; con->out_kvec_bytes += size; } /* * Chop off a kvec from the end. Return residual number of bytes for * that kvec, i.e. how many bytes would have been written if the kvec * hadn't been nuked. */ static int con_out_kvec_skip(struct ceph_connection *con) { int off = con->out_kvec_cur - con->out_kvec; int skip = 0; if (con->out_kvec_bytes > 0) { skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len; BUG_ON(con->out_kvec_bytes < skip); BUG_ON(!con->out_kvec_left); con->out_kvec_bytes -= skip; con->out_kvec_left--; } return skip; } #ifdef CONFIG_BLOCK /* * For a bio data item, a piece is whatever remains of the next * entry in the current bio iovec, or the first entry in the next * bio in the list. 
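 *
 * For example (illustrative numbers): if the current bio_vec still has 1024
 * of its bytes left to send, those 1024 bytes are the next piece; once they
 * have been consumed the cursor moves on, possibly to the first bio_vec of
 * the next bio in the chain.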
*/ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; struct ceph_bio_iter *it = &cursor->bio_iter; cursor->resid = min_t(size_t, length, data->bio_length); *it = data->bio_pos; if (cursor->resid < it->iter.bi_size) it->iter.bi_size = cursor->resid; BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); } static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio, cursor->bio_iter.iter); *page_offset = bv.bv_offset; *length = bv.bv_len; return bv.bv_page; } static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { struct ceph_bio_iter *it = &cursor->bio_iter; BUG_ON(bytes > cursor->resid); BUG_ON(bytes > bio_iter_len(it->bio, it->iter)); cursor->resid -= bytes; bio_advance_iter(it->bio, &it->iter, bytes); if (!cursor->resid) { BUG_ON(!cursor->last_piece); return false; /* no more data */ } if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done)) return false; /* more bytes to process in this segment */ if (!it->iter.bi_size) { it->bio = it->bio->bi_next; it->iter = it->bio->bi_iter; if (cursor->resid < it->iter.bi_size) it->iter.bi_size = cursor->resid; } BUG_ON(cursor->last_piece); BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); return true; } #endif /* CONFIG_BLOCK */ static void ceph_msg_data_bvecs_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; struct bio_vec *bvecs = data->bvec_pos.bvecs; cursor->resid = min_t(size_t, length, data->bvec_pos.iter.bi_size); cursor->bvec_iter = data->bvec_pos.iter; cursor->bvec_iter.bi_size = cursor->resid; BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); cursor->last_piece = cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter); } static struct page *ceph_msg_data_bvecs_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct bio_vec bv = bvec_iter_bvec(cursor->data->bvec_pos.bvecs, cursor->bvec_iter); *page_offset = bv.bv_offset; *length = bv.bv_len; return bv.bv_page; } static bool ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { struct bio_vec *bvecs = cursor->data->bvec_pos.bvecs; BUG_ON(bytes > cursor->resid); BUG_ON(bytes > bvec_iter_len(bvecs, cursor->bvec_iter)); cursor->resid -= bytes; bvec_iter_advance(bvecs, &cursor->bvec_iter, bytes); if (!cursor->resid) { BUG_ON(!cursor->last_piece); return false; /* no more data */ } if (!bytes || cursor->bvec_iter.bi_bvec_done) return false; /* more bytes to process in this segment */ BUG_ON(cursor->last_piece); BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); cursor->last_piece = cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter); return true; } /* * For a page array, a piece comes from the first page in the array * that has not already been fully consumed. 
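 *
 * The initial offset into the first page comes from data->alignment
 * (masked to stay within a page); each piece is therefore at most
 * PAGE_SIZE bytes, and page_index simply steps through the array as
 * pages are consumed.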
*/ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; int page_count; BUG_ON(data->type != CEPH_MSG_DATA_PAGES); BUG_ON(!data->pages); BUG_ON(!data->length); cursor->resid = min(length, data->length); page_count = calc_pages_for(data->alignment, (u64)data->length); cursor->page_offset = data->alignment & ~PAGE_MASK; cursor->page_index = 0; BUG_ON(page_count > (int)USHRT_MAX); cursor->page_count = (unsigned short)page_count; BUG_ON(length > SIZE_MAX - cursor->page_offset); cursor->last_piece = cursor->page_offset + cursor->resid <= PAGE_SIZE; } static struct page * ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct ceph_msg_data *data = cursor->data; BUG_ON(data->type != CEPH_MSG_DATA_PAGES); BUG_ON(cursor->page_index >= cursor->page_count); BUG_ON(cursor->page_offset >= PAGE_SIZE); *page_offset = cursor->page_offset; if (cursor->last_piece) *length = cursor->resid; else *length = PAGE_SIZE - *page_offset; return data->pages[cursor->page_index]; } static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES); BUG_ON(cursor->page_offset + bytes > PAGE_SIZE); /* Advance the cursor page offset */ cursor->resid -= bytes; cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK; if (!bytes || cursor->page_offset) return false; /* more bytes to process in the current page */ if (!cursor->resid) return false; /* no more data */ /* Move on to the next page; offset is already at 0 */ BUG_ON(cursor->page_index >= cursor->page_count); cursor->page_index++; cursor->last_piece = cursor->resid <= PAGE_SIZE; return true; } /* * For a pagelist, a piece is whatever remains to be consumed in the * first page in the list, or the front of the next page. 
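 *
 * cursor->offset counts bytes consumed across the whole pagelist; the
 * offset within the current page is just (offset & ~PAGE_MASK), and
 * the page pointer walks the list via each page's lru linkage.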
*/ static void ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor, size_t length) { struct ceph_msg_data *data = cursor->data; struct ceph_pagelist *pagelist; struct page *page; BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); pagelist = data->pagelist; BUG_ON(!pagelist); if (!length) return; /* pagelist can be assigned but empty */ BUG_ON(list_empty(&pagelist->head)); page = list_first_entry(&pagelist->head, struct page, lru); cursor->resid = min(length, pagelist->length); cursor->page = page; cursor->offset = 0; cursor->last_piece = cursor->resid <= PAGE_SIZE; } static struct page * ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length) { struct ceph_msg_data *data = cursor->data; struct ceph_pagelist *pagelist; BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); pagelist = data->pagelist; BUG_ON(!pagelist); BUG_ON(!cursor->page); BUG_ON(cursor->offset + cursor->resid != pagelist->length); /* offset of first page in pagelist is always 0 */ *page_offset = cursor->offset & ~PAGE_MASK; if (cursor->last_piece) *length = cursor->resid; else *length = PAGE_SIZE - *page_offset; return cursor->page; } static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { struct ceph_msg_data *data = cursor->data; struct ceph_pagelist *pagelist; BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); pagelist = data->pagelist; BUG_ON(!pagelist); BUG_ON(cursor->offset + cursor->resid != pagelist->length); BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE); /* Advance the cursor offset */ cursor->resid -= bytes; cursor->offset += bytes; /* offset of first page in pagelist is always 0 */ if (!bytes || cursor->offset & ~PAGE_MASK) return false; /* more bytes to process in the current page */ if (!cursor->resid) return false; /* no more data */ /* Move on to the next page */ BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); cursor->page = list_next_entry(cursor->page, lru); cursor->last_piece = cursor->resid <= PAGE_SIZE; return true; } /* * Message data is handled (sent or received) in pieces, where each * piece resides on a single page. The network layer might not * consume an entire piece at once. A data item's cursor keeps * track of which piece is next to process and how much remains to * be processed in that piece. It also tracks whether the current * piece is the last one in the data item. 
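 *
 * In short, both the send and the receive paths drive the cursor with
 * a loop roughly like this (see write_partial_message_data() and
 * read_partial_msg_data()):
 *
 *	ceph_msg_data_cursor_init(msg, data_len);
 *	cursor = &msg->cursor;
 *	while (cursor->total_resid) {
 *		if (!cursor->resid) {	-- current data item consumed
 *			ceph_msg_data_advance(cursor, 0);
 *			continue;
 *		}
 *		page = ceph_msg_data_next(cursor, &off, &len, &last_piece);
 *		ret = <send or receive up to len bytes at page + off>;
 *		ceph_msg_data_advance(cursor, ret);
 *	}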
*/ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) { size_t length = cursor->total_resid; switch (cursor->data->type) { case CEPH_MSG_DATA_PAGELIST: ceph_msg_data_pagelist_cursor_init(cursor, length); break; case CEPH_MSG_DATA_PAGES: ceph_msg_data_pages_cursor_init(cursor, length); break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: ceph_msg_data_bio_cursor_init(cursor, length); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_BVECS: ceph_msg_data_bvecs_cursor_init(cursor, length); break; case CEPH_MSG_DATA_NONE: default: /* BUG(); */ break; } cursor->need_crc = true; } static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) { struct ceph_msg_data_cursor *cursor = &msg->cursor; struct ceph_msg_data *data; BUG_ON(!length); BUG_ON(length > msg->data_length); BUG_ON(list_empty(&msg->data)); cursor->data_head = &msg->data; cursor->total_resid = length; data = list_first_entry(&msg->data, struct ceph_msg_data, links); cursor->data = data; __ceph_msg_data_cursor_init(cursor); } /* * Return the page containing the next piece to process for a given * data item, and supply the page offset and length of that piece. * Indicate whether this is the last piece in this data item. */ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, size_t *page_offset, size_t *length, bool *last_piece) { struct page *page; switch (cursor->data->type) { case CEPH_MSG_DATA_PAGELIST: page = ceph_msg_data_pagelist_next(cursor, page_offset, length); break; case CEPH_MSG_DATA_PAGES: page = ceph_msg_data_pages_next(cursor, page_offset, length); break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: page = ceph_msg_data_bio_next(cursor, page_offset, length); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_BVECS: page = ceph_msg_data_bvecs_next(cursor, page_offset, length); break; case CEPH_MSG_DATA_NONE: default: page = NULL; break; } BUG_ON(!page); BUG_ON(*page_offset + *length > PAGE_SIZE); BUG_ON(!*length); BUG_ON(*length > cursor->resid); if (last_piece) *last_piece = cursor->last_piece; return page; } /* * Returns true if the result moves the cursor on to the next piece * of the data item. */ static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) { bool new_piece; BUG_ON(bytes > cursor->resid); switch (cursor->data->type) { case CEPH_MSG_DATA_PAGELIST: new_piece = ceph_msg_data_pagelist_advance(cursor, bytes); break; case CEPH_MSG_DATA_PAGES: new_piece = ceph_msg_data_pages_advance(cursor, bytes); break; #ifdef CONFIG_BLOCK case CEPH_MSG_DATA_BIO: new_piece = ceph_msg_data_bio_advance(cursor, bytes); break; #endif /* CONFIG_BLOCK */ case CEPH_MSG_DATA_BVECS: new_piece = ceph_msg_data_bvecs_advance(cursor, bytes); break; case CEPH_MSG_DATA_NONE: default: BUG(); break; } cursor->total_resid -= bytes; if (!cursor->resid && cursor->total_resid) { WARN_ON(!cursor->last_piece); BUG_ON(list_is_last(&cursor->data->links, cursor->data_head)); cursor->data = list_next_entry(cursor->data, links); __ceph_msg_data_cursor_init(cursor); new_piece = true; } cursor->need_crc = new_piece; } static size_t sizeof_footer(struct ceph_connection *con) { return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ? sizeof(struct ceph_msg_footer) : sizeof(struct ceph_msg_footer_old); } static void prepare_message_data(struct ceph_msg *msg, u32 data_len) { BUG_ON(!msg); BUG_ON(!data_len); /* Initialize data cursor */ ceph_msg_data_cursor_init(msg, (size_t)data_len); } /* * Prepare footer for currently outgoing message, and finish things * off. 
 * Assumes out_kvec* are already valid; we just add on to the end.
 */
static void prepare_write_message_footer(struct ceph_connection *con)
{
	struct ceph_msg *m = con->out_msg;

	m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;

	dout("prepare_write_message_footer %p\n", con);
	con_out_kvec_add(con, sizeof_footer(con), &m->footer);
	if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
		if (con->ops->sign_message)
			con->ops->sign_message(m);
		else
			m->footer.sig = 0;
	} else {
		m->old_footer.flags = m->footer.flags;
	}
	con->out_more = m->more_to_follow;
	con->out_msg_done = true;
}

/*
 * Prepare headers for the next outgoing message.
 */
static void prepare_write_message(struct ceph_connection *con)
{
	struct ceph_msg *m;
	u32 crc;

	con_out_kvec_reset(con);
	con->out_msg_done = false;

	/* Sneak an ack in there first?  If we can get it into the same
	 * TCP packet that's a good thing. */
	if (con->in_seq > con->in_seq_acked) {
		con->in_seq_acked = con->in_seq;
		con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
		con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
		con_out_kvec_add(con, sizeof (con->out_temp_ack),
			&con->out_temp_ack);
	}

	BUG_ON(list_empty(&con->out_queue));
	m = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
	con->out_msg = m;
	BUG_ON(m->con != con);

	/* put message on sent list */
	ceph_msg_get(m);
	list_move_tail(&m->list_head, &con->out_sent);

	/*
	 * only assign outgoing seq # if we haven't sent this message
	 * yet.  if it is requeued, resend with its original seq.
	 */
	if (m->needs_out_seq) {
		m->hdr.seq = cpu_to_le64(++con->out_seq);
		m->needs_out_seq = false;

		if (con->ops->reencode_message)
			con->ops->reencode_message(m);
	}

	dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
	     m, con->out_seq, le16_to_cpu(m->hdr.type),
	     le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
	     m->data_length);
	WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len));
	WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));

	/* tag + hdr + front + middle */
	con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
	con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr);
	con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);

	if (m->middle)
		con_out_kvec_add(con, m->middle->vec.iov_len,
			m->middle->vec.iov_base);

	/* fill in hdr crc and finalize hdr */
	crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
	con->out_msg->hdr.crc = cpu_to_le32(crc);
	memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr));

	/* fill in front and middle crc, footer */
	crc = crc32c(0, m->front.iov_base, m->front.iov_len);
	con->out_msg->footer.front_crc = cpu_to_le32(crc);
	if (m->middle) {
		crc = crc32c(0, m->middle->vec.iov_base,
				m->middle->vec.iov_len);
		con->out_msg->footer.middle_crc = cpu_to_le32(crc);
	} else
		con->out_msg->footer.middle_crc = 0;
	dout("%s front_crc %u middle_crc %u\n", __func__,
	     le32_to_cpu(con->out_msg->footer.front_crc),
	     le32_to_cpu(con->out_msg->footer.middle_crc));
	con->out_msg->footer.flags = 0;

	/* is there a data payload? */
	con->out_msg->footer.data_crc = 0;
	if (m->data_length) {
		prepare_message_data(con->out_msg, m->data_length);
		con->out_more = 1;  /* data + footer will follow */
	} else {
		/* no, queue up footer too and be done */
		prepare_write_message_footer(con);
	}

	con_flag_set(con, CON_FLAG_WRITE_PENDING);
}

/*
 * Prepare an ack.
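 * On the wire this is just the tag_ack byte followed by the acked
 * sequence number as a little-endian 64-bit value (out_temp_ack).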
*/ static void prepare_write_ack(struct ceph_connection *con) { dout("prepare_write_ack %p %llu -> %llu\n", con, con->in_seq_acked, con->in_seq); con->in_seq_acked = con->in_seq; con_out_kvec_reset(con); con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); con->out_temp_ack = cpu_to_le64(con->in_seq_acked); con_out_kvec_add(con, sizeof (con->out_temp_ack), &con->out_temp_ack); con->out_more = 1; /* more will follow.. eventually.. */ con_flag_set(con, CON_FLAG_WRITE_PENDING); } /* * Prepare to share the seq during handshake */ static void prepare_write_seq(struct ceph_connection *con) { dout("prepare_write_seq %p %llu -> %llu\n", con, con->in_seq_acked, con->in_seq); con->in_seq_acked = con->in_seq; con_out_kvec_reset(con); con->out_temp_ack = cpu_to_le64(con->in_seq_acked); con_out_kvec_add(con, sizeof (con->out_temp_ack), &con->out_temp_ack); con_flag_set(con, CON_FLAG_WRITE_PENDING); } /* * Prepare to write keepalive byte. */ static void prepare_write_keepalive(struct ceph_connection *con) { dout("prepare_write_keepalive %p\n", con); con_out_kvec_reset(con); if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { struct timespec64 now; ktime_get_real_ts64(&now); con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); ceph_encode_timespec64(&con->out_temp_keepalive2, &now); con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), &con->out_temp_keepalive2); } else { con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); } con_flag_set(con, CON_FLAG_WRITE_PENDING); } /* * Connection negotiation. */ static int get_connect_authorizer(struct ceph_connection *con) { struct ceph_auth_handshake *auth; int auth_proto; if (!con->ops->get_authorizer) { con->auth = NULL; con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; con->out_connect.authorizer_len = 0; return 0; } auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry); if (IS_ERR(auth)) return PTR_ERR(auth); con->auth = auth; con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len); return 0; } /* * We connected to a peer and are saying hello. 
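 * The banner is simply the CEPH_BANNER string followed by our encoded
 * address (msgr->my_enc_addr); the connect message itself
 * (con->out_connect) is queued separately by prepare_write_connect().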
*/ static void prepare_write_banner(struct ceph_connection *con) { con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), &con->msgr->my_enc_addr); con->out_more = 0; con_flag_set(con, CON_FLAG_WRITE_PENDING); } static void __prepare_write_connect(struct ceph_connection *con) { con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect); if (con->auth) con_out_kvec_add(con, con->auth->authorizer_buf_len, con->auth->authorizer_buf); con->out_more = 0; con_flag_set(con, CON_FLAG_WRITE_PENDING); } static int prepare_write_connect(struct ceph_connection *con) { unsigned int global_seq = get_global_seq(con->msgr, 0); int proto; int ret; switch (con->peer_name.type) { case CEPH_ENTITY_TYPE_MON: proto = CEPH_MONC_PROTOCOL; break; case CEPH_ENTITY_TYPE_OSD: proto = CEPH_OSDC_PROTOCOL; break; case CEPH_ENTITY_TYPE_MDS: proto = CEPH_MDSC_PROTOCOL; break; default: BUG(); } dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, con->connect_seq, global_seq, proto); con->out_connect.features = cpu_to_le64(from_msgr(con->msgr)->supported_features); con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); con->out_connect.global_seq = cpu_to_le32(global_seq); con->out_connect.protocol_version = cpu_to_le32(proto); con->out_connect.flags = 0; ret = get_connect_authorizer(con); if (ret) return ret; __prepare_write_connect(con); return 0; } /* * write as much of pending kvecs to the socket as we can. * 1 -> done * 0 -> socket full, but more to do * <0 -> error */ static int write_partial_kvec(struct ceph_connection *con) { int ret; dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes); while (con->out_kvec_bytes > 0) { ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur, con->out_kvec_left, con->out_kvec_bytes, con->out_more); if (ret <= 0) goto out; con->out_kvec_bytes -= ret; if (con->out_kvec_bytes == 0) break; /* done */ /* account for full iov entries consumed */ while (ret >= con->out_kvec_cur->iov_len) { BUG_ON(!con->out_kvec_left); ret -= con->out_kvec_cur->iov_len; con->out_kvec_cur++; con->out_kvec_left--; } /* and for a partially-consumed entry */ if (ret) { con->out_kvec_cur->iov_len -= ret; con->out_kvec_cur->iov_base += ret; } } con->out_kvec_left = 0; ret = 1; out: dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, con->out_kvec_bytes, con->out_kvec_left, ret); return ret; /* done! */ } static u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset, unsigned int length) { char *kaddr; kaddr = kmap(page); BUG_ON(kaddr == NULL); crc = crc32c(crc, kaddr + page_offset, length); kunmap(page); return crc; } /* * Write as much message data payload as we can. If we finish, queue * up the footer. * 1 -> done, footer is now queued in out_kvec[]. * 0 -> socket full, but more to do * <0 -> error */ static int write_partial_message_data(struct ceph_connection *con) { struct ceph_msg *msg = con->out_msg; struct ceph_msg_data_cursor *cursor = &msg->cursor; bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); u32 crc; dout("%s %p msg %p\n", __func__, con, msg); if (list_empty(&msg->data)) return -EINVAL; /* * Iterate through each page that contains data to be * written, and send as much as possible for each. * * If we are calculating the data crc (the default), we will * need to map the page. If we have no pages, they have * been revoked, so use the zero page. */ crc = do_datacrc ? 
le32_to_cpu(msg->footer.data_crc) : 0; while (cursor->total_resid) { struct page *page; size_t page_offset; size_t length; bool last_piece; int ret; if (!cursor->resid) { ceph_msg_data_advance(cursor, 0); continue; } page = ceph_msg_data_next(cursor, &page_offset, &length, &last_piece); ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, !last_piece); if (ret <= 0) { if (do_datacrc) msg->footer.data_crc = cpu_to_le32(crc); return ret; } if (do_datacrc && cursor->need_crc) crc = ceph_crc32c_page(crc, page, page_offset, length); ceph_msg_data_advance(cursor, (size_t)ret); } dout("%s %p msg %p done\n", __func__, con, msg); /* prepare and queue up footer, too */ if (do_datacrc) msg->footer.data_crc = cpu_to_le32(crc); else msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; con_out_kvec_reset(con); prepare_write_message_footer(con); return 1; /* must return > 0 to indicate success */ } /* * write some zeros */ static int write_partial_skip(struct ceph_connection *con) { int ret; dout("%s %p %d left\n", __func__, con, con->out_skip); while (con->out_skip > 0) { size_t size = min(con->out_skip, (int) PAGE_SIZE); ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true); if (ret <= 0) goto out; con->out_skip -= ret; } ret = 1; out: return ret; } /* * Prepare to read connection handshake, or an ack. */ static void prepare_read_banner(struct ceph_connection *con) { dout("prepare_read_banner %p\n", con); con->in_base_pos = 0; } static void prepare_read_connect(struct ceph_connection *con) { dout("prepare_read_connect %p\n", con); con->in_base_pos = 0; } static void prepare_read_ack(struct ceph_connection *con) { dout("prepare_read_ack %p\n", con); con->in_base_pos = 0; } static void prepare_read_seq(struct ceph_connection *con) { dout("prepare_read_seq %p\n", con); con->in_base_pos = 0; con->in_tag = CEPH_MSGR_TAG_SEQ; } static void prepare_read_tag(struct ceph_connection *con) { dout("prepare_read_tag %p\n", con); con->in_base_pos = 0; con->in_tag = CEPH_MSGR_TAG_READY; } static void prepare_read_keepalive_ack(struct ceph_connection *con) { dout("prepare_read_keepalive_ack %p\n", con); con->in_base_pos = 0; } /* * Prepare to read a message. 
*/ static int prepare_read_message(struct ceph_connection *con) { dout("prepare_read_message %p\n", con); BUG_ON(con->in_msg != NULL); con->in_base_pos = 0; con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; return 0; } static int read_partial(struct ceph_connection *con, int end, int size, void *object) { while (con->in_base_pos < end) { int left = end - con->in_base_pos; int have = size - left; int ret = ceph_tcp_recvmsg(con->sock, object + have, left); if (ret <= 0) return ret; con->in_base_pos += ret; } return 1; } /* * Read all or part of the connect-side handshake on a new connection */ static int read_partial_banner(struct ceph_connection *con) { int size; int end; int ret; dout("read_partial_banner %p at %d\n", con, con->in_base_pos); /* peer's banner */ size = strlen(CEPH_BANNER); end = size; ret = read_partial(con, end, size, con->in_banner); if (ret <= 0) goto out; size = sizeof (con->actual_peer_addr); end += size; ret = read_partial(con, end, size, &con->actual_peer_addr); if (ret <= 0) goto out; size = sizeof (con->peer_addr_for_me); end += size; ret = read_partial(con, end, size, &con->peer_addr_for_me); if (ret <= 0) goto out; out: return ret; } static int read_partial_connect(struct ceph_connection *con) { int size; int end; int ret; dout("read_partial_connect %p at %d\n", con, con->in_base_pos); size = sizeof (con->in_reply); end = size; ret = read_partial(con, end, size, &con->in_reply); if (ret <= 0) goto out; if (con->auth) { size = le32_to_cpu(con->in_reply.authorizer_len); if (size > con->auth->authorizer_reply_buf_len) { pr_err("authorizer reply too big: %d > %zu\n", size, con->auth->authorizer_reply_buf_len); ret = -EINVAL; goto out; } end += size; ret = read_partial(con, end, size, con->auth->authorizer_reply_buf); if (ret <= 0) goto out; } dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", con, (int)con->in_reply.tag, le32_to_cpu(con->in_reply.connect_seq), le32_to_cpu(con->in_reply.global_seq)); out: return ret; } /* * Verify the hello banner looks okay. */ static int verify_hello(struct ceph_connection *con) { if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { pr_err("connect to %s got bad banner\n", ceph_pr_addr(&con->peer_addr.in_addr)); con->error_msg = "protocol error, bad banner"; return -1; } return 0; } static bool addr_is_blank(struct sockaddr_storage *ss) { struct in_addr *addr = &((struct sockaddr_in *)ss)->sin_addr; struct in6_addr *addr6 = &((struct sockaddr_in6 *)ss)->sin6_addr; switch (ss->ss_family) { case AF_INET: return addr->s_addr == htonl(INADDR_ANY); case AF_INET6: return ipv6_addr_any(addr6); default: return true; } } static int addr_port(struct sockaddr_storage *ss) { switch (ss->ss_family) { case AF_INET: return ntohs(((struct sockaddr_in *)ss)->sin_port); case AF_INET6: return ntohs(((struct sockaddr_in6 *)ss)->sin6_port); } return 0; } static void addr_set_port(struct sockaddr_storage *ss, int p) { switch (ss->ss_family) { case AF_INET: ((struct sockaddr_in *)ss)->sin_port = htons(p); break; case AF_INET6: ((struct sockaddr_in6 *)ss)->sin6_port = htons(p); break; } } /* * Unlike other *_pton function semantics, zero indicates success. 
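 * For example, ceph_pton("192.168.0.1", 11, &ss, -1, NULL) fills in an
 * AF_INET sockaddr and returns 0; a non-zero return means the string
 * parsed as neither an IPv4 nor an IPv6 address.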
*/ static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss, char delim, const char **ipend) { struct sockaddr_in *in4 = (struct sockaddr_in *) ss; struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss; memset(ss, 0, sizeof(*ss)); if (in4_pton(str, len, (u8 *)&in4->sin_addr.s_addr, delim, ipend)) { ss->ss_family = AF_INET; return 0; } if (in6_pton(str, len, (u8 *)&in6->sin6_addr.s6_addr, delim, ipend)) { ss->ss_family = AF_INET6; return 0; } return -EINVAL; } /* * Extract hostname string and resolve using kernel DNS facility. */ #ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER static int ceph_dns_resolve_name(const char *name, size_t namelen, struct sockaddr_storage *ss, char delim, const char **ipend) { const char *end, *delim_p; char *colon_p, *ip_addr = NULL; int ip_len, ret; /* * The end of the hostname occurs immediately preceding the delimiter or * the port marker (':') where the delimiter takes precedence. */ delim_p = memchr(name, delim, namelen); colon_p = memchr(name, ':', namelen); if (delim_p && colon_p) end = delim_p < colon_p ? delim_p : colon_p; else if (!delim_p && colon_p) end = colon_p; else { end = delim_p; if (!end) /* case: hostname:/ */ end = name + namelen; } if (end <= name) return -EINVAL; /* do dns_resolve upcall */ ip_len = dns_query(NULL, name, end - name, NULL, &ip_addr, NULL); if (ip_len > 0) ret = ceph_pton(ip_addr, ip_len, ss, -1, NULL); else ret = -ESRCH; kfree(ip_addr); *ipend = end; pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name, ret, ret ? "failed" : ceph_pr_addr(ss)); return ret; } #else static inline int ceph_dns_resolve_name(const char *name, size_t namelen, struct sockaddr_storage *ss, char delim, const char **ipend) { return -EINVAL; } #endif /* * Parse a server name (IP or hostname). If a valid IP address is not found * then try to extract a hostname to resolve using userspace DNS upcall. */ static int ceph_parse_server_name(const char *name, size_t namelen, struct sockaddr_storage *ss, char delim, const char **ipend) { int ret; ret = ceph_pton(name, namelen, ss, delim, ipend); if (ret) ret = ceph_dns_resolve_name(name, namelen, ss, delim, ipend); return ret; } /* * Parse an ip[:port] list into an addr array. Use the default * monitor port if a port isn't specified. */ int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, int max_count, int *count) { int i, ret = -EINVAL; const char *p = c; dout("parse_ips on '%.*s'\n", (int)(end-c), c); for (i = 0; i < max_count; i++) { const char *ipend; struct sockaddr_storage *ss = &addr[i].in_addr; int port; char delim = ','; if (*p == '[') { delim = ']'; p++; } ret = ceph_parse_server_name(p, end - p, ss, delim, &ipend); if (ret) goto bad; ret = -EINVAL; p = ipend; if (delim == ']') { if (*p != ']') { dout("missing matching ']'\n"); goto bad; } p++; } /* port? 
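 *
 * Each entry may be "addr" or "[addr]" with an optional ":port";
 * entries are comma separated.  A missing port (or an explicit :0)
 * falls back to CEPH_MON_PORT, and anything above 65535 is rejected.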
*/ if (p < end && *p == ':') { port = 0; p++; while (p < end && *p >= '0' && *p <= '9') { port = (port * 10) + (*p - '0'); p++; } if (port == 0) port = CEPH_MON_PORT; else if (port > 65535) goto bad; } else { port = CEPH_MON_PORT; } addr_set_port(ss, port); dout("parse_ips got %s\n", ceph_pr_addr(ss)); if (p == end) break; if (*p != ',') goto bad; p++; } if (p != end) goto bad; if (count) *count = i + 1; return 0; bad: pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); return ret; } EXPORT_SYMBOL(ceph_parse_ips); static int process_banner(struct ceph_connection *con) { dout("process_banner on %p\n", con); if (verify_hello(con) < 0) return -1; ceph_decode_addr(&con->actual_peer_addr); ceph_decode_addr(&con->peer_addr_for_me); /* * Make sure the other end is who we wanted. note that the other * end may not yet know their ip address, so if it's 0.0.0.0, give * them the benefit of the doubt. */ if (memcmp(&con->peer_addr, &con->actual_peer_addr, sizeof(con->peer_addr)) != 0 && !(addr_is_blank(&con->actual_peer_addr.in_addr) && con->actual_peer_addr.nonce == con->peer_addr.nonce)) { pr_warn("wrong peer, want %s/%d, got %s/%d\n", ceph_pr_addr(&con->peer_addr.in_addr), (int)le32_to_cpu(con->peer_addr.nonce), ceph_pr_addr(&con->actual_peer_addr.in_addr), (int)le32_to_cpu(con->actual_peer_addr.nonce)); con->error_msg = "wrong peer at address"; return -1; } /* * did we learn our address? */ if (addr_is_blank(&con->msgr->inst.addr.in_addr)) { int port = addr_port(&con->msgr->inst.addr.in_addr); memcpy(&con->msgr->inst.addr.in_addr, &con->peer_addr_for_me.in_addr, sizeof(con->peer_addr_for_me.in_addr)); addr_set_port(&con->msgr->inst.addr.in_addr, port); encode_my_addr(con->msgr); dout("process_banner learned my addr is %s\n", ceph_pr_addr(&con->msgr->inst.addr.in_addr)); } return 0; } static int process_connect(struct ceph_connection *con) { u64 sup_feat = from_msgr(con->msgr)->supported_features; u64 req_feat = from_msgr(con->msgr)->required_features; u64 server_feat = le64_to_cpu(con->in_reply.features); int ret; dout("process_connect on %p tag %d\n", con, (int)con->in_tag); if (con->auth) { int len = le32_to_cpu(con->in_reply.authorizer_len); /* * Any connection that defines ->get_authorizer() * should also define ->add_authorizer_challenge() and * ->verify_authorizer_reply(). * * See get_connect_authorizer(). 
*/ if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { ret = con->ops->add_authorizer_challenge( con, con->auth->authorizer_reply_buf, len); if (ret < 0) return ret; con_out_kvec_reset(con); __prepare_write_connect(con); prepare_read_connect(con); return 0; } if (len) { ret = con->ops->verify_authorizer_reply(con); if (ret < 0) { con->error_msg = "bad authorize reply"; return ret; } } } switch (con->in_reply.tag) { case CEPH_MSGR_TAG_FEATURES: pr_err("%s%lld %s feature set mismatch," " my %llx < server's %llx, missing %llx\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr), sup_feat, server_feat, server_feat & ~sup_feat); con->error_msg = "missing required protocol features"; reset_connection(con); return -1; case CEPH_MSGR_TAG_BADPROTOVER: pr_err("%s%lld %s protocol version mismatch," " my %d != server's %d\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr), le32_to_cpu(con->out_connect.protocol_version), le32_to_cpu(con->in_reply.protocol_version)); con->error_msg = "protocol version mismatch"; reset_connection(con); return -1; case CEPH_MSGR_TAG_BADAUTHORIZER: con->auth_retry++; dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, con->auth_retry); if (con->auth_retry == 2) { con->error_msg = "connect authorization failure"; return -1; } con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; prepare_read_connect(con); break; case CEPH_MSGR_TAG_RESETSESSION: /* * If we connected with a large connect_seq but the peer * has no record of a session with us (no connection, or * connect_seq == 0), they will send RESETSESION to indicate * that they must have reset their session, and may have * dropped messages. */ dout("process_connect got RESET peer seq %u\n", le32_to_cpu(con->in_reply.connect_seq)); pr_err("%s%lld %s connection reset\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr)); reset_connection(con); con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; prepare_read_connect(con); /* Tell ceph about it. */ mutex_unlock(&con->mutex); pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name)); if (con->ops->peer_reset) con->ops->peer_reset(con); mutex_lock(&con->mutex); if (con->state != CON_STATE_NEGOTIATING) return -EAGAIN; break; case CEPH_MSGR_TAG_RETRY_SESSION: /* * If we sent a smaller connect_seq than the peer has, try * again with a larger value. */ dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", le32_to_cpu(con->out_connect.connect_seq), le32_to_cpu(con->in_reply.connect_seq)); con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; prepare_read_connect(con); break; case CEPH_MSGR_TAG_RETRY_GLOBAL: /* * If we sent a smaller global_seq than the peer has, try * again with a larger value. 
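 * get_global_seq() is passed the peer's value here, so our counter
 * jumps past it before the connect request is rebuilt and re-sent.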
*/ dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", con->peer_global_seq, le32_to_cpu(con->in_reply.global_seq)); get_global_seq(con->msgr, le32_to_cpu(con->in_reply.global_seq)); con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; prepare_read_connect(con); break; case CEPH_MSGR_TAG_SEQ: case CEPH_MSGR_TAG_READY: if (req_feat & ~server_feat) { pr_err("%s%lld %s protocol feature mismatch," " my required %llx > server's %llx, need %llx\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr), req_feat, server_feat, req_feat & ~server_feat); con->error_msg = "missing required protocol features"; reset_connection(con); return -1; } WARN_ON(con->state != CON_STATE_NEGOTIATING); con->state = CON_STATE_OPEN; con->auth_retry = 0; /* we authenticated; clear flag */ con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->connect_seq++; con->peer_features = server_feat; dout("process_connect got READY gseq %d cseq %d (%d)\n", con->peer_global_seq, le32_to_cpu(con->in_reply.connect_seq), con->connect_seq); WARN_ON(con->connect_seq != le32_to_cpu(con->in_reply.connect_seq)); if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) con_flag_set(con, CON_FLAG_LOSSYTX); con->delay = 0; /* reset backoff memory */ if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { prepare_write_seq(con); prepare_read_seq(con); } else { prepare_read_tag(con); } break; case CEPH_MSGR_TAG_WAIT: /* * If there is a connection race (we are opening * connections to each other), one of us may just have * to WAIT. This shouldn't happen if we are the * client. */ con->error_msg = "protocol error, got WAIT as client"; return -1; default: con->error_msg = "protocol error, garbage tag during connect"; return -1; } return 0; } /* * read (part of) an ack */ static int read_partial_ack(struct ceph_connection *con) { int size = sizeof (con->in_temp_ack); int end = size; return read_partial(con, end, size, &con->in_temp_ack); } /* * We can finally discard anything that's been acked. */ static void process_ack(struct ceph_connection *con) { struct ceph_msg *m; u64 ack = le64_to_cpu(con->in_temp_ack); u64 seq; bool reconnect = (con->in_tag == CEPH_MSGR_TAG_SEQ); struct list_head *list = reconnect ? &con->out_queue : &con->out_sent; /* * In the reconnect case, con_fault() has requeued messages * in out_sent. We should cleanup old messages according to * the reconnect seq. 
*/ while (!list_empty(list)) { m = list_first_entry(list, struct ceph_msg, list_head); if (reconnect && m->needs_out_seq) break; seq = le64_to_cpu(m->hdr.seq); if (seq > ack) break; dout("got ack for seq %llu type %d at %p\n", seq, le16_to_cpu(m->hdr.type), m); m->ack_stamp = jiffies; ceph_msg_remove(m); } prepare_read_tag(con); } static int read_partial_message_section(struct ceph_connection *con, struct kvec *section, unsigned int sec_len, u32 *crc) { int ret, left; BUG_ON(!section); while (section->iov_len < sec_len) { BUG_ON(section->iov_base == NULL); left = sec_len - section->iov_len; ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + section->iov_len, left); if (ret <= 0) return ret; section->iov_len += ret; } if (section->iov_len == sec_len) *crc = crc32c(0, section->iov_base, section->iov_len); return 1; } static int read_partial_msg_data(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; struct ceph_msg_data_cursor *cursor = &msg->cursor; bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); struct page *page; size_t page_offset; size_t length; u32 crc = 0; int ret; BUG_ON(!msg); if (list_empty(&msg->data)) return -EIO; if (do_datacrc) crc = con->in_data_crc; while (cursor->total_resid) { if (!cursor->resid) { ceph_msg_data_advance(cursor, 0); continue; } page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); if (ret <= 0) { if (do_datacrc) con->in_data_crc = crc; return ret; } if (do_datacrc) crc = ceph_crc32c_page(crc, page, page_offset, ret); ceph_msg_data_advance(cursor, (size_t)ret); } if (do_datacrc) con->in_data_crc = crc; return 1; /* must return > 0 to indicate success */ } /* * read (part of) a message. */ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); static int read_partial_message(struct ceph_connection *con) { struct ceph_msg *m = con->in_msg; int size; int end; int ret; unsigned int front_len, middle_len, data_len; bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); u64 seq; u32 crc; dout("read_partial_message con %p msg %p\n", con, m); /* header */ size = sizeof (con->in_hdr); end = size; ret = read_partial(con, end, size, &con->in_hdr); if (ret <= 0) return ret; crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); if (cpu_to_le32(crc) != con->in_hdr.crc) { pr_err("read_partial_message bad hdr crc %u != expected %u\n", crc, con->in_hdr.crc); return -EBADMSG; } front_len = le32_to_cpu(con->in_hdr.front_len); if (front_len > CEPH_MSG_MAX_FRONT_LEN) return -EIO; middle_len = le32_to_cpu(con->in_hdr.middle_len); if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) return -EIO; data_len = le32_to_cpu(con->in_hdr.data_len); if (data_len > CEPH_MSG_MAX_DATA_LEN) return -EIO; /* verify seq# */ seq = le64_to_cpu(con->in_hdr.seq); if ((s64)seq - (s64)con->in_seq < 1) { pr_info("skipping %s%lld %s seq %lld expected %lld\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr), seq, con->in_seq + 1); con->in_base_pos = -front_len - middle_len - data_len - sizeof_footer(con); con->in_tag = CEPH_MSGR_TAG_READY; return 1; } else if ((s64)seq - (s64)con->in_seq > 1) { pr_err("read_partial_message bad seq %lld expected %lld\n", seq, con->in_seq + 1); con->error_msg = "bad message sequence # for incoming message"; return -EBADE; } /* allocate message? 
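 * ceph_con_in_msg_alloc() asks the connection's ->alloc_msg() hook for
 * a buffer; if the hook says "skip", the remainder of this message is
 * discarded by leaving in_base_pos negative and bumping in_seq.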
*/ if (!con->in_msg) { int skip = 0; dout("got hdr type %d front %d data %d\n", con->in_hdr.type, front_len, data_len); ret = ceph_con_in_msg_alloc(con, &skip); if (ret < 0) return ret; BUG_ON(!con->in_msg ^ skip); if (skip) { /* skip this message */ dout("alloc_msg said skip message\n"); con->in_base_pos = -front_len - middle_len - data_len - sizeof_footer(con); con->in_tag = CEPH_MSGR_TAG_READY; con->in_seq++; return 1; } BUG_ON(!con->in_msg); BUG_ON(con->in_msg->con != con); m = con->in_msg; m->front.iov_len = 0; /* haven't read it yet */ if (m->middle) m->middle->vec.iov_len = 0; /* prepare for data payload, if any */ if (data_len) prepare_message_data(con->in_msg, data_len); } /* front */ ret = read_partial_message_section(con, &m->front, front_len, &con->in_front_crc); if (ret <= 0) return ret; /* middle */ if (m->middle) { ret = read_partial_message_section(con, &m->middle->vec, middle_len, &con->in_middle_crc); if (ret <= 0) return ret; } /* (page) data */ if (data_len) { ret = read_partial_msg_data(con); if (ret <= 0) return ret; } /* footer */ size = sizeof_footer(con); end += size; ret = read_partial(con, end, size, &m->footer); if (ret <= 0) return ret; if (!need_sign) { m->footer.flags = m->old_footer.flags; m->footer.sig = 0; } dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", m, front_len, m->footer.front_crc, middle_len, m->footer.middle_crc, data_len, m->footer.data_crc); /* crc ok? */ if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { pr_err("read_partial_message %p front crc %u != exp. %u\n", m, con->in_front_crc, m->footer.front_crc); return -EBADMSG; } if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { pr_err("read_partial_message %p middle crc %u != exp %u\n", m, con->in_middle_crc, m->footer.middle_crc); return -EBADMSG; } if (do_datacrc && (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { pr_err("read_partial_message %p data crc %u != exp. %u\n", m, con->in_data_crc, le32_to_cpu(m->footer.data_crc)); return -EBADMSG; } if (need_sign && con->ops->check_message_signature && con->ops->check_message_signature(m)) { pr_err("read_partial_message %p signature check failed\n", m); return -EBADMSG; } return 1; /* done! */ } /* * Process message. This happens in the worker thread. The callback should * be careful not to do anything that waits on other incoming messages or it * may deadlock. */ static void process_message(struct ceph_connection *con) { struct ceph_msg *msg = con->in_msg; BUG_ON(con->in_msg->con != con); con->in_msg = NULL; /* if first message, set peer_name */ if (con->peer_name.type == 0) con->peer_name = msg->hdr.src; con->in_seq++; mutex_unlock(&con->mutex); dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", msg, le64_to_cpu(msg->hdr.seq), ENTITY_NAME(msg->hdr.src), le16_to_cpu(msg->hdr.type), ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), le32_to_cpu(msg->hdr.front_len), le32_to_cpu(msg->hdr.data_len), con->in_front_crc, con->in_middle_crc, con->in_data_crc); con->ops->dispatch(con, msg); mutex_lock(&con->mutex); } static int read_keepalive_ack(struct ceph_connection *con) { struct ceph_timespec ceph_ts; size_t size = sizeof(ceph_ts); int ret = read_partial(con, size, size, &ceph_ts); if (ret <= 0) return ret; ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts); prepare_read_tag(con); return 1; } /* * Write something to the socket. Called in a worker thread when the * socket appears to be writeable and we have something ready to send. 
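 *
 * try_write() works through pending output in a fixed order, roughly:
 * open the socket if the connection is still PREOPEN (banner first),
 * flush out_kvec and any out_skip zero padding, push message data and
 * then the footer, and finally, when OPEN and idle, queue a keepalive,
 * the next message, or an ack before clearing WRITE_PENDING.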
*/ static int try_write(struct ceph_connection *con) { int ret = 1; dout("try_write start %p state %lu\n", con, con->state); if (con->state != CON_STATE_PREOPEN && con->state != CON_STATE_CONNECTING && con->state != CON_STATE_NEGOTIATING && con->state != CON_STATE_OPEN) return 0; /* open the socket first? */ if (con->state == CON_STATE_PREOPEN) { BUG_ON(con->sock); con->state = CON_STATE_CONNECTING; con_out_kvec_reset(con); prepare_write_banner(con); prepare_read_banner(con); BUG_ON(con->in_msg); con->in_tag = CEPH_MSGR_TAG_READY; dout("try_write initiating connect on %p new state %lu\n", con, con->state); ret = ceph_tcp_connect(con); if (ret < 0) { con->error_msg = "connect error"; goto out; } } more: dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); BUG_ON(!con->sock); /* kvec data queued? */ if (con->out_kvec_left) { ret = write_partial_kvec(con); if (ret <= 0) goto out; } if (con->out_skip) { ret = write_partial_skip(con); if (ret <= 0) goto out; } /* msg pages? */ if (con->out_msg) { if (con->out_msg_done) { ceph_msg_put(con->out_msg); con->out_msg = NULL; /* we're done with this one */ goto do_next; } ret = write_partial_message_data(con); if (ret == 1) goto more; /* we need to send the footer, too! */ if (ret == 0) goto out; if (ret < 0) { dout("try_write write_partial_message_data err %d\n", ret); goto out; } } do_next: if (con->state == CON_STATE_OPEN) { if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) { prepare_write_keepalive(con); goto more; } /* is anything else pending? */ if (!list_empty(&con->out_queue)) { prepare_write_message(con); goto more; } if (con->in_seq > con->in_seq_acked) { prepare_write_ack(con); goto more; } } /* Nothing to do! */ con_flag_clear(con, CON_FLAG_WRITE_PENDING); dout("try_write nothing else to write.\n"); ret = 0; out: dout("try_write done on %p ret %d\n", con, ret); return ret; } /* * Read what we can from the socket. */ static int try_read(struct ceph_connection *con) { int ret = -1; more: dout("try_read start on %p state %lu\n", con, con->state); if (con->state != CON_STATE_CONNECTING && con->state != CON_STATE_NEGOTIATING && con->state != CON_STATE_OPEN) return 0; BUG_ON(!con->sock); dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, con->in_base_pos); if (con->state == CON_STATE_CONNECTING) { dout("try_read connecting\n"); ret = read_partial_banner(con); if (ret <= 0) goto out; ret = process_banner(con); if (ret < 0) goto out; con->state = CON_STATE_NEGOTIATING; /* * Received banner is good, exchange connection info. * Do not reset out_kvec, as sending our banner raced * with receiving peer banner after connect completed. */ ret = prepare_write_connect(con); if (ret < 0) goto out; prepare_read_connect(con); /* Send connection info before awaiting response */ goto out; } if (con->state == CON_STATE_NEGOTIATING) { dout("try_read negotiating\n"); ret = read_partial_connect(con); if (ret <= 0) goto out; ret = process_connect(con); if (ret < 0) goto out; goto more; } WARN_ON(con->state != CON_STATE_OPEN); if (con->in_base_pos < 0) { /* * skipping + discarding content. */ ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos); if (ret <= 0) goto out; dout("skipped %d / %d bytes\n", ret, -con->in_base_pos); con->in_base_pos += ret; if (con->in_base_pos) goto more; } if (con->in_tag == CEPH_MSGR_TAG_READY) { /* * what's next? 
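 * A single tag byte decides what we read next: MSG, ACK,
 * KEEPALIVE2_ACK, or CLOSE (which shuts the socket down and marks the
 * connection CLOSED); anything else is a protocol error.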
*/ ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); if (ret <= 0) goto out; dout("try_read got tag %d\n", (int)con->in_tag); switch (con->in_tag) { case CEPH_MSGR_TAG_MSG: prepare_read_message(con); break; case CEPH_MSGR_TAG_ACK: prepare_read_ack(con); break; case CEPH_MSGR_TAG_KEEPALIVE2_ACK: prepare_read_keepalive_ack(con); break; case CEPH_MSGR_TAG_CLOSE: con_close_socket(con); con->state = CON_STATE_CLOSED; goto out; default: goto bad_tag; } } if (con->in_tag == CEPH_MSGR_TAG_MSG) { ret = read_partial_message(con); if (ret <= 0) { switch (ret) { case -EBADMSG: con->error_msg = "bad crc/signature"; /* fall through */ case -EBADE: ret = -EIO; break; case -EIO: con->error_msg = "io error"; break; } goto out; } if (con->in_tag == CEPH_MSGR_TAG_READY) goto more; process_message(con); if (con->state == CON_STATE_OPEN) prepare_read_tag(con); goto more; } if (con->in_tag == CEPH_MSGR_TAG_ACK || con->in_tag == CEPH_MSGR_TAG_SEQ) { /* * the final handshake seq exchange is semantically * equivalent to an ACK */ ret = read_partial_ack(con); if (ret <= 0) goto out; process_ack(con); goto more; } if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { ret = read_keepalive_ack(con); if (ret <= 0) goto out; goto more; } out: dout("try_read done on %p ret %d\n", con, ret); return ret; bad_tag: pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag); con->error_msg = "protocol error, garbage tag"; ret = -1; goto out; } /* * Atomically queue work on a connection after the specified delay. * Bump @con reference to avoid races with connection teardown. * Returns 0 if work was queued, or an error code otherwise. */ static int queue_con_delay(struct ceph_connection *con, unsigned long delay) { if (!con->ops->get(con)) { dout("%s %p ref count 0\n", __func__, con); return -ENOENT; } if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { dout("%s %p - already queued\n", __func__, con); con->ops->put(con); return -EBUSY; } dout("%s %p %lu\n", __func__, con, delay); return 0; } static void queue_con(struct ceph_connection *con) { (void) queue_con_delay(con, 0); } static void cancel_con(struct ceph_connection *con) { if (cancel_delayed_work(&con->work)) { dout("%s %p\n", __func__, con); con->ops->put(con); } } static bool con_sock_closed(struct ceph_connection *con) { if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED)) return false; #define CASE(x) \ case CON_STATE_ ## x: \ con->error_msg = "socket closed (con state " #x ")"; \ break; switch (con->state) { CASE(CLOSED); CASE(PREOPEN); CASE(CONNECTING); CASE(NEGOTIATING); CASE(OPEN); CASE(STANDBY); default: pr_warn("%s con %p unrecognized state %lu\n", __func__, con, con->state); con->error_msg = "unrecognized con state"; BUG(); break; } #undef CASE return true; } static bool con_backoff(struct ceph_connection *con) { int ret; if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF)) return false; ret = queue_con_delay(con, round_jiffies_relative(con->delay)); if (ret) { dout("%s: con %p FAILED to back off %lu\n", __func__, con, con->delay); BUG_ON(ret == -ENOENT); con_flag_set(con, CON_FLAG_BACKOFF); } return true; } /* Finish fault handling; con->mutex must *not* be held here */ static void con_fault_finish(struct ceph_connection *con) { dout("%s %p\n", __func__, con); /* * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. 
*/ if (con->auth_retry) { dout("auth_retry %d, invalidating\n", con->auth_retry); if (con->ops->invalidate_authorizer) con->ops->invalidate_authorizer(con); con->auth_retry = 0; } if (con->ops->fault) con->ops->fault(con); } /* * Do some work on a connection. Drop a connection ref when we're done. */ static void ceph_con_workfn(struct work_struct *work) { struct ceph_connection *con = container_of(work, struct ceph_connection, work.work); bool fault; mutex_lock(&con->mutex); while (true) { int ret; if ((fault = con_sock_closed(con))) { dout("%s: con %p SOCK_CLOSED\n", __func__, con); break; } if (con_backoff(con)) { dout("%s: con %p BACKOFF\n", __func__, con); break; } if (con->state == CON_STATE_STANDBY) { dout("%s: con %p STANDBY\n", __func__, con); break; } if (con->state == CON_STATE_CLOSED) { dout("%s: con %p CLOSED\n", __func__, con); BUG_ON(con->sock); break; } if (con->state == CON_STATE_PREOPEN) { dout("%s: con %p PREOPEN\n", __func__, con); BUG_ON(con->sock); } ret = try_read(con); if (ret < 0) { if (ret == -EAGAIN) continue; if (!con->error_msg) con->error_msg = "socket error on read"; fault = true; break; } ret = try_write(con); if (ret < 0) { if (ret == -EAGAIN) continue; if (!con->error_msg) con->error_msg = "socket error on write"; fault = true; } break; /* If we make it to here, we're done */ } if (fault) con_fault(con); mutex_unlock(&con->mutex); if (fault) con_fault_finish(con); con->ops->put(con); } /* * Generic error/fault handler. A retry mechanism is used with * exponential backoff */ static void con_fault(struct ceph_connection *con) { dout("fault %p state %lu to peer %s\n", con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); con->error_msg = NULL; WARN_ON(con->state != CON_STATE_CONNECTING && con->state != CON_STATE_NEGOTIATING && con->state != CON_STATE_OPEN); con_close_socket(con); if (con_flag_test(con, CON_FLAG_LOSSYTX)) { dout("fault on LOSSYTX channel, marking CLOSED\n"); con->state = CON_STATE_CLOSED; return; } if (con->in_msg) { BUG_ON(con->in_msg->con != con); ceph_msg_put(con->in_msg); con->in_msg = NULL; } if (con->out_msg) { BUG_ON(con->out_msg->con != con); ceph_msg_put(con->out_msg); con->out_msg = NULL; } /* Requeue anything that hasn't been acked */ list_splice_init(&con->out_sent, &con->out_queue); /* If there are no messages queued or keepalive pending, place * the connection in a STANDBY state */ if (list_empty(&con->out_queue) && !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) { dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); con_flag_clear(con, CON_FLAG_WRITE_PENDING); con->state = CON_STATE_STANDBY; } else { /* retry after a delay. 
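 * The delay starts at BASE_DELAY_INTERVAL and doubles on each
 * successive fault until it reaches MAX_DELAY_INTERVAL; the BACKOFF
 * flag makes the worker requeue itself with that delay (see
 * con_backoff()).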
*/ con->state = CON_STATE_PREOPEN; if (con->delay == 0) con->delay = BASE_DELAY_INTERVAL; else if (con->delay < MAX_DELAY_INTERVAL) con->delay *= 2; con_flag_set(con, CON_FLAG_BACKOFF); queue_con(con); } } /* * initialize a new messenger instance */ void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr) { spin_lock_init(&msgr->global_seq_lock); if (myaddr) msgr->inst.addr = *myaddr; /* select a random nonce */ msgr->inst.addr.type = 0; get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); encode_my_addr(msgr); atomic_set(&msgr->stopping, 0); write_pnet(&msgr->net, get_net(current->nsproxy->net_ns)); dout("%s %p\n", __func__, msgr); } EXPORT_SYMBOL(ceph_messenger_init); void ceph_messenger_fini(struct ceph_messenger *msgr) { put_net(read_pnet(&msgr->net)); } EXPORT_SYMBOL(ceph_messenger_fini); static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con) { if (msg->con) msg->con->ops->put(msg->con); msg->con = con ? con->ops->get(con) : NULL; BUG_ON(msg->con != con); } static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ if (con->state == CON_STATE_STANDBY) { dout("clear_standby %p and ++connect_seq\n", con); con->state = CON_STATE_PREOPEN; con->connect_seq++; WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING)); WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)); } } /* * Queue up an outgoing message on the given connection. */ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) { /* set src+dst */ msg->hdr.src = con->msgr->inst.name; BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); msg->needs_out_seq = true; mutex_lock(&con->mutex); if (con->state == CON_STATE_CLOSED) { dout("con_send %p closed, dropping %p\n", con, msg); ceph_msg_put(msg); mutex_unlock(&con->mutex); return; } msg_con_set(msg, con); BUG_ON(!list_empty(&msg->list_head)); list_add_tail(&msg->list_head, &con->out_queue); dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type), ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), le32_to_cpu(msg->hdr.front_len), le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len)); clear_standby(con); mutex_unlock(&con->mutex); /* if there wasn't anything waiting to send before, queue * new work */ if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) queue_con(con); } EXPORT_SYMBOL(ceph_con_send); /* * Revoke a message that was previously queued for send */ void ceph_msg_revoke(struct ceph_msg *msg) { struct ceph_connection *con = msg->con; if (!con) { dout("%s msg %p null con\n", __func__, msg); return; /* Message not in our possession */ } mutex_lock(&con->mutex); if (!list_empty(&msg->list_head)) { dout("%s %p msg %p - was on queue\n", __func__, con, msg); list_del_init(&msg->list_head); msg->hdr.seq = 0; ceph_msg_put(msg); } if (con->out_msg == msg) { BUG_ON(con->out_skip); /* footer */ if (con->out_msg_done) { con->out_skip += con_out_kvec_skip(con); } else { BUG_ON(!msg->data_length); con->out_skip += sizeof_footer(con); } /* data, middle, front */ if (msg->data_length) con->out_skip += msg->cursor.total_resid; if (msg->middle) con->out_skip += con_out_kvec_skip(con); con->out_skip += con_out_kvec_skip(con); dout("%s %p msg %p - was sending, will write %d skip %d\n", __func__, con, msg, con->out_kvec_bytes, con->out_skip); msg->hdr.seq = 0; con->out_msg = NULL; ceph_msg_put(msg); } mutex_unlock(&con->mutex); } /* * Revoke a message that we may be reading data into */ void 
ceph_msg_revoke_incoming(struct ceph_msg *msg) { struct ceph_connection *con = msg->con; if (!con) { dout("%s msg %p null con\n", __func__, msg); return; /* Message not in our possession */ } mutex_lock(&con->mutex); if (con->in_msg == msg) { unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); /* skip rest of message */ dout("%s %p msg %p revoked\n", __func__, con, msg); con->in_base_pos = con->in_base_pos - sizeof(struct ceph_msg_header) - front_len - middle_len - data_len - sizeof(struct ceph_msg_footer); ceph_msg_put(con->in_msg); con->in_msg = NULL; con->in_tag = CEPH_MSGR_TAG_READY; con->in_seq++; } else { dout("%s %p in_msg %p msg %p no-op\n", __func__, con, con->in_msg, msg); } mutex_unlock(&con->mutex); } /* * Queue a keepalive byte to ensure the tcp connection is alive. */ void ceph_con_keepalive(struct ceph_connection *con) { dout("con_keepalive %p\n", con); mutex_lock(&con->mutex); clear_standby(con); con_flag_set(con, CON_FLAG_KEEPALIVE_PENDING); mutex_unlock(&con->mutex); if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) queue_con(con); } EXPORT_SYMBOL(ceph_con_keepalive); bool ceph_con_keepalive_expired(struct ceph_connection *con, unsigned long interval) { if (interval > 0 && (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) { struct timespec64 now; struct timespec64 ts; ktime_get_real_ts64(&now); jiffies_to_timespec64(interval, &ts); ts = timespec64_add(con->last_keepalive_ack, ts); return timespec64_compare(&now, &ts) >= 0; } return false; } static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type) { struct ceph_msg_data *data; if (WARN_ON(!ceph_msg_data_type_valid(type))) return NULL; data = kmem_cache_zalloc(ceph_msg_data_cache, GFP_NOFS); if (!data) return NULL; data->type = type; INIT_LIST_HEAD(&data->links); return data; } static void ceph_msg_data_destroy(struct ceph_msg_data *data) { if (!data) return; WARN_ON(!list_empty(&data->links)); if (data->type == CEPH_MSG_DATA_PAGELIST) ceph_pagelist_release(data->pagelist); kmem_cache_free(ceph_msg_data_cache, data); } void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, size_t length, size_t alignment) { struct ceph_msg_data *data; BUG_ON(!pages); BUG_ON(!length); data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES); BUG_ON(!data); data->pages = pages; data->length = length; data->alignment = alignment & ~PAGE_MASK; list_add_tail(&data->links, &msg->data); msg->data_length += length; } EXPORT_SYMBOL(ceph_msg_data_add_pages); void ceph_msg_data_add_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist) { struct ceph_msg_data *data; BUG_ON(!pagelist); BUG_ON(!pagelist->length); data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST); BUG_ON(!data); data->pagelist = pagelist; list_add_tail(&data->links, &msg->data); msg->data_length += pagelist->length; } EXPORT_SYMBOL(ceph_msg_data_add_pagelist); #ifdef CONFIG_BLOCK void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos, u32 length) { struct ceph_msg_data *data; data = ceph_msg_data_create(CEPH_MSG_DATA_BIO); BUG_ON(!data); data->bio_pos = *bio_pos; data->bio_length = length; list_add_tail(&data->links, &msg->data); msg->data_length += length; } EXPORT_SYMBOL(ceph_msg_data_add_bio); #endif /* CONFIG_BLOCK */ void ceph_msg_data_add_bvecs(struct ceph_msg *msg, struct ceph_bvec_iter *bvec_pos) { struct ceph_msg_data *data; data = 
ceph_msg_data_create(CEPH_MSG_DATA_BVECS); BUG_ON(!data); data->bvec_pos = *bvec_pos; list_add_tail(&data->links, &msg->data); msg->data_length += bvec_pos->iter.bi_size; } EXPORT_SYMBOL(ceph_msg_data_add_bvecs); /* * construct a new message with given type, size * the new msg has a ref count of 1. */ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail) { struct ceph_msg *m; m = kmem_cache_zalloc(ceph_msg_cache, flags); if (m == NULL) goto out; m->hdr.type = cpu_to_le16(type); m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); m->hdr.front_len = cpu_to_le32(front_len); INIT_LIST_HEAD(&m->list_head); kref_init(&m->kref); INIT_LIST_HEAD(&m->data); /* front */ if (front_len) { m->front.iov_base = ceph_kvmalloc(front_len, flags); if (m->front.iov_base == NULL) { dout("ceph_msg_new can't allocate %d bytes\n", front_len); goto out2; } } else { m->front.iov_base = NULL; } m->front_alloc_len = m->front.iov_len = front_len; dout("ceph_msg_new %p front %d\n", m, front_len); return m; out2: ceph_msg_put(m); out: if (!can_fail) { pr_err("msg_new can't create type %d front %d\n", type, front_len); WARN_ON(1); } else { dout("msg_new can't create type %d front %d\n", type, front_len); } return NULL; } EXPORT_SYMBOL(ceph_msg_new); /* * Allocate "middle" portion of a message, if it is needed and wasn't * allocated by alloc_msg. This allows us to read a small fixed-size * per-type header in the front and then gracefully fail (i.e., * propagate the error to the caller based on info in the front) when * the middle is too large. */ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) { int type = le16_to_cpu(msg->hdr.type); int middle_len = le32_to_cpu(msg->hdr.middle_len); dout("alloc_middle %p type %d %s middle_len %d\n", msg, type, ceph_msg_type_name(type), middle_len); BUG_ON(!middle_len); BUG_ON(msg->middle); msg->middle = ceph_buffer_new(middle_len, GFP_NOFS); if (!msg->middle) return -ENOMEM; return 0; } /* * Allocate a message for receiving an incoming message on a * connection, and save the result in con->in_msg. Uses the * connection's private alloc_msg op if available. * * Returns 0 on success, or a negative error code. * * On success, if we set *skip = 1: * - the next message should be skipped and ignored. * - con->in_msg == NULL * or if we set *skip = 0: * - con->in_msg is non-null. * On error (ENOMEM, EAGAIN, ...), * - con->in_msg == NULL */ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) { struct ceph_msg_header *hdr = &con->in_hdr; int middle_len = le32_to_cpu(hdr->middle_len); struct ceph_msg *msg; int ret = 0; BUG_ON(con->in_msg != NULL); BUG_ON(!con->ops->alloc_msg); mutex_unlock(&con->mutex); msg = con->ops->alloc_msg(con, hdr, skip); mutex_lock(&con->mutex); if (con->state != CON_STATE_OPEN) { if (msg) ceph_msg_put(msg); return -EAGAIN; } if (msg) { BUG_ON(*skip); msg_con_set(msg, con); con->in_msg = msg; } else { /* * Null message pointer means either we should skip * this message or we couldn't allocate memory. The * former is not an error. */ if (*skip) return 0; con->error_msg = "error allocating memory for incoming message"; return -ENOMEM; } memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); if (middle_len && !con->in_msg->middle) { ret = ceph_alloc_middle(con, con->in_msg); if (ret < 0) { ceph_msg_put(con->in_msg); con->in_msg = NULL; } } return ret; } /* * Free a generically kmalloc'd message. 
*/ static void ceph_msg_free(struct ceph_msg *m) { dout("%s %p\n", __func__, m); kvfree(m->front.iov_base); kmem_cache_free(ceph_msg_cache, m); } static void ceph_msg_release(struct kref *kref) { struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); struct ceph_msg_data *data, *next; dout("%s %p\n", __func__, m); WARN_ON(!list_empty(&m->list_head)); msg_con_set(m, NULL); /* drop middle, data, if any */ if (m->middle) { ceph_buffer_put(m->middle); m->middle = NULL; } list_for_each_entry_safe(data, next, &m->data, links) { list_del_init(&data->links); ceph_msg_data_destroy(data); } m->data_length = 0; if (m->pool) ceph_msgpool_put(m->pool, m); else ceph_msg_free(m); } struct ceph_msg *ceph_msg_get(struct ceph_msg *msg) { dout("%s %p (was %d)\n", __func__, msg, kref_read(&msg->kref)); kref_get(&msg->kref); return msg; } EXPORT_SYMBOL(ceph_msg_get); void ceph_msg_put(struct ceph_msg *msg) { dout("%s %p (was %d)\n", __func__, msg, kref_read(&msg->kref)); kref_put(&msg->kref, ceph_msg_release); } EXPORT_SYMBOL(ceph_msg_put); void ceph_msg_dump(struct ceph_msg *msg) { pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg, msg->front_alloc_len, msg->data_length); print_hex_dump(KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->hdr, sizeof(msg->hdr), true); print_hex_dump(KERN_DEBUG, " front: ", DUMP_PREFIX_OFFSET, 16, 1, msg->front.iov_base, msg->front.iov_len, true); if (msg->middle) print_hex_dump(KERN_DEBUG, "middle: ", DUMP_PREFIX_OFFSET, 16, 1, msg->middle->vec.iov_base, msg->middle->vec.iov_len, true); print_hex_dump(KERN_DEBUG, "footer: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->footer, sizeof(msg->footer), true); } EXPORT_SYMBOL(ceph_msg_dump); |
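The helpers above give the complete send path for a message: allocate with ceph_msg_new(), optionally attach data with ceph_msg_data_add_pages()/ceph_msg_data_add_pagelist(), then queue with ceph_con_send(). A minimal sketch follows; it is not part of messenger.c, and example_queue_msg together with its caller-supplied connection, type and front length are placeholders.

static int example_queue_msg(struct ceph_connection *con, int type,
			     int front_len)
{
	struct ceph_msg *msg;

	/* can_fail == true: return NULL on allocation failure instead of WARNing */
	msg = ceph_msg_new(type, front_len, GFP_NOFS, true);
	if (!msg)
		return -ENOMEM;

	/* ... fill msg->front.iov_base with front_len bytes of payload ... */

	/*
	 * ceph_con_send() links the message on con->out_queue and kicks the
	 * connection work if no write was already pending.
	 */
	ceph_con_send(con, msg);
	return 0;
}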
/*
 * Sound core.  This file is composed of two parts: sound_class,
 * which is common to both OSS and ALSA, and the OSS sound core,
 * which is used by OSS or an emulation of it.
 */

/*
 * First, the common part.
*/ #include <linux/module.h> #include <linux/device.h> #include <linux/err.h> #include <linux/kdev_t.h> #include <linux/major.h> #include <sound/core.h> #ifdef CONFIG_SOUND_OSS_CORE static int __init init_oss_soundcore(void); static void cleanup_oss_soundcore(void); #else static inline int init_oss_soundcore(void) { return 0; } static inline void cleanup_oss_soundcore(void) { } #endif struct class *sound_class; EXPORT_SYMBOL(sound_class); MODULE_DESCRIPTION("Core sound module"); MODULE_AUTHOR("Alan Cox"); MODULE_LICENSE("GPL"); static char *sound_devnode(struct device *dev, umode_t *mode) { if (MAJOR(dev->devt) == SOUND_MAJOR) return NULL; return kasprintf(GFP_KERNEL, "snd/%s", dev_name(dev)); } static int __init init_soundcore(void) { int rc; rc = init_oss_soundcore(); if (rc) return rc; sound_class = class_create(THIS_MODULE, "sound"); if (IS_ERR(sound_class)) { cleanup_oss_soundcore(); return PTR_ERR(sound_class); } sound_class->devnode = sound_devnode; return 0; } static void __exit cleanup_soundcore(void) { cleanup_oss_soundcore(); class_destroy(sound_class); } subsys_initcall(init_soundcore); module_exit(cleanup_soundcore); #ifdef CONFIG_SOUND_OSS_CORE /* * OSS sound core handling. Breaks out sound functions to submodules * * Author: Alan Cox <alan@lxorguk.ukuu.org.uk> * * Fixes: * * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * -------------------- * * Top level handler for the sound subsystem. Various devices can * plug into this. The fact they don't all go via OSS doesn't mean * they don't have to implement the OSS API. There is a lot of logic * to keeping much of the OSS weight out of the code in a compatibility * module, but it's up to the driver to rember to load it... * * The code provides a set of functions for registration of devices * by type. This is done rather than providing a single call so that * we can hide any future changes in the internals (eg when we go to * 32bit dev_t) from the modules and their interface. * * Secondly we need to allocate the dsp, dsp16 and audio devices as * one. Thus we misuse the chains a bit to simplify this. * * Thirdly to make it more fun and for 2.3.x and above we do all * of this using fine grained locking. * * FIXME: we have to resolve modules and fine grained load/unload * locking at some point in 2.3.x. */ #include <linux/init.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sound.h> #include <linux/kmod.h> #define SOUND_STEP 16 struct sound_unit { int unit_minor; const struct file_operations *unit_fops; struct sound_unit *next; char name[32]; }; /* * By default, OSS sound_core claims full legacy minor range (0-255) * of SOUND_MAJOR to trap open attempts to any sound minor and * requests modules using custom sound-slot/service-* module aliases. * The only benefit of doing this is allowing use of custom module * aliases instead of the standard char-major-* ones. This behavior * prevents alternative OSS implementation and is scheduled to be * removed. * * CONFIG_SOUND_OSS_CORE_PRECLAIM and soundcore.preclaim_oss kernel * parameter are added to allow distros and developers to try and * switch to alternative implementations without needing to rebuild * the kernel in the meantime. If preclaim_oss is non-zero, the * kernel will behave the same as before. 
All SOUND_MAJOR minors are * preclaimed and the custom module aliases along with standard chrdev * ones are emitted if a missing device is opened. If preclaim_oss is * zero, sound_core only grabs what's actually in use and for missing * devices only the standard chrdev aliases are requested. * * All these clutters are scheduled to be removed along with * sound-slot/service-* module aliases. */ #ifdef CONFIG_SOUND_OSS_CORE_PRECLAIM static int preclaim_oss = 1; #else static int preclaim_oss = 0; #endif module_param(preclaim_oss, int, 0444); static int soundcore_open(struct inode *, struct file *); static const struct file_operations soundcore_fops = { /* We must have an owner or the module locking fails */ .owner = THIS_MODULE, .open = soundcore_open, .llseek = noop_llseek, }; /* * Low level list operator. Scan the ordered list, find a hole and * join into it. Called with the lock asserted */ static int __sound_insert_unit(struct sound_unit * s, struct sound_unit **list, const struct file_operations *fops, int index, int low, int top) { int n=low; if (index < 0) { /* first free */ while (*list && (*list)->unit_minor<n) list=&((*list)->next); while(n<top) { /* Found a hole ? */ if(*list==NULL || (*list)->unit_minor>n) break; list=&((*list)->next); n+=SOUND_STEP; } if(n>=top) return -ENOENT; } else { n = low+(index*16); while (*list) { if ((*list)->unit_minor==n) return -EBUSY; if ((*list)->unit_minor>n) break; list=&((*list)->next); } } /* * Fill it in */ s->unit_minor=n; s->unit_fops=fops; /* * Link it */ s->next=*list; *list=s; return n; } /* * Remove a node from the chain. Called with the lock asserted */ static struct sound_unit *__sound_remove_unit(struct sound_unit **list, int unit) { while(*list) { struct sound_unit *p=*list; if(p->unit_minor==unit) { *list=p->next; return p; } list=&(p->next); } printk(KERN_ERR "Sound device %d went missing!\n", unit); return NULL; } /* * This lock guards the sound loader list. */ static DEFINE_SPINLOCK(sound_loader_lock); /* * Allocate the controlling structure and add it to the sound driver * list. Acquires locks as needed */ static int sound_insert_unit(struct sound_unit **list, const struct file_operations *fops, int index, int low, int top, const char *name, umode_t mode, struct device *dev) { struct sound_unit *s = kmalloc(sizeof(*s), GFP_KERNEL); int r; if (!s) return -ENOMEM; spin_lock(&sound_loader_lock); retry: r = __sound_insert_unit(s, list, fops, index, low, top); spin_unlock(&sound_loader_lock); if (r < 0) goto fail; else if (r < SOUND_STEP) sprintf(s->name, "sound/%s", name); else sprintf(s->name, "sound/%s%d", name, r / SOUND_STEP); if (!preclaim_oss) { /* * Something else might have grabbed the minor. If * first free slot is requested, rescan with @low set * to the next unit; otherwise, -EBUSY. */ r = __register_chrdev(SOUND_MAJOR, s->unit_minor, 1, s->name, &soundcore_fops); if (r < 0) { spin_lock(&sound_loader_lock); __sound_remove_unit(list, s->unit_minor); if (index < 0) { low = s->unit_minor + SOUND_STEP; goto retry; } spin_unlock(&sound_loader_lock); r = -EBUSY; goto fail; } } device_create(sound_class, dev, MKDEV(SOUND_MAJOR, s->unit_minor), NULL, "%s", s->name+6); return s->unit_minor; fail: kfree(s); return r; } /* * Remove a unit. Acquires locks as needed. The drivers MUST have * completed the removal before their file operations become * invalid. 
*/ static void sound_remove_unit(struct sound_unit **list, int unit) { struct sound_unit *p; spin_lock(&sound_loader_lock); p = __sound_remove_unit(list, unit); spin_unlock(&sound_loader_lock); if (p) { if (!preclaim_oss) __unregister_chrdev(SOUND_MAJOR, p->unit_minor, 1, p->name); device_destroy(sound_class, MKDEV(SOUND_MAJOR, p->unit_minor)); kfree(p); } } /* * Allocations * * 0 *16 Mixers * 1 *8 Sequencers * 2 *16 Midi * 3 *16 DSP * 4 *16 SunDSP * 5 *16 DSP16 * 6 -- sndstat (obsolete) * 7 *16 unused * 8 -- alternate sequencer (see above) * 9 *16 raw synthesizer access * 10 *16 unused * 11 *16 unused * 12 *16 unused * 13 *16 unused * 14 *16 unused * 15 *16 unused */ static struct sound_unit *chains[SOUND_STEP]; /** * register_sound_special_device - register a special sound node * @fops: File operations for the driver * @unit: Unit number to allocate * @dev: device pointer * * Allocate a special sound device by minor number from the sound * subsystem. * * Return: The allocated number is returned on success. On failure, * a negative error code is returned. */ int register_sound_special_device(const struct file_operations *fops, int unit, struct device *dev) { const int chain = unit % SOUND_STEP; int max_unit = 256; const char *name; char _name[16]; switch (chain) { case 0: name = "mixer"; break; case 1: name = "sequencer"; if (unit >= SOUND_STEP) goto __unknown; max_unit = unit + 1; break; case 2: name = "midi"; break; case 3: name = "dsp"; break; case 4: name = "audio"; break; case 5: name = "dspW"; break; case 8: name = "sequencer2"; if (unit >= SOUND_STEP) goto __unknown; max_unit = unit + 1; break; case 9: name = "dmmidi"; break; case 10: name = "dmfm"; break; case 12: name = "adsp"; break; case 13: name = "amidi"; break; case 14: name = "admmidi"; break; default: { __unknown: sprintf(_name, "unknown%d", chain); if (unit >= SOUND_STEP) strcat(_name, "-"); name = _name; } break; } return sound_insert_unit(&chains[chain], fops, -1, unit, max_unit, name, 0600, dev); } EXPORT_SYMBOL(register_sound_special_device); int register_sound_special(const struct file_operations *fops, int unit) { return register_sound_special_device(fops, unit, NULL); } EXPORT_SYMBOL(register_sound_special); /** * register_sound_mixer - register a mixer device * @fops: File operations for the driver * @dev: Unit number to allocate * * Allocate a mixer device. Unit is the number of the mixer requested. * Pass -1 to request the next free mixer unit. * * Return: On success, the allocated number is returned. On failure, * a negative error code is returned. */ int register_sound_mixer(const struct file_operations *fops, int dev) { return sound_insert_unit(&chains[0], fops, dev, 0, 128, "mixer", 0600, NULL); } EXPORT_SYMBOL(register_sound_mixer); /* * DSP's are registered as a triple. Register only one and cheat * in open - see below. */ /** * register_sound_dsp - register a DSP device * @fops: File operations for the driver * @dev: Unit number to allocate * * Allocate a DSP device. Unit is the number of the DSP requested. * Pass -1 to request the next free DSP unit. * * This function allocates both the audio and dsp device entries together * and will always allocate them as a matching pair - eg dsp3/audio3 * * Return: On success, the allocated number is returned. On failure, * a negative error code is returned. 
*/ int register_sound_dsp(const struct file_operations *fops, int dev) { return sound_insert_unit(&chains[3], fops, dev, 3, 131, "dsp", 0600, NULL); } EXPORT_SYMBOL(register_sound_dsp); /** * unregister_sound_special - unregister a special sound device * @unit: unit number to allocate * * Release a sound device that was allocated with * register_sound_special(). The unit passed is the return value from * the register function. */ void unregister_sound_special(int unit) { sound_remove_unit(&chains[unit % SOUND_STEP], unit); } EXPORT_SYMBOL(unregister_sound_special); /** * unregister_sound_mixer - unregister a mixer * @unit: unit number to allocate * * Release a sound device that was allocated with register_sound_mixer(). * The unit passed is the return value from the register function. */ void unregister_sound_mixer(int unit) { sound_remove_unit(&chains[0], unit); } EXPORT_SYMBOL(unregister_sound_mixer); /** * unregister_sound_dsp - unregister a DSP device * @unit: unit number to allocate * * Release a sound device that was allocated with register_sound_dsp(). * The unit passed is the return value from the register function. * * Both of the allocated units are released together automatically. */ void unregister_sound_dsp(int unit) { sound_remove_unit(&chains[3], unit); } EXPORT_SYMBOL(unregister_sound_dsp); static struct sound_unit *__look_for_unit(int chain, int unit) { struct sound_unit *s; s=chains[chain]; while(s && s->unit_minor <= unit) { if(s->unit_minor==unit) return s; s=s->next; } return NULL; } static int soundcore_open(struct inode *inode, struct file *file) { int chain; int unit = iminor(inode); struct sound_unit *s; const struct file_operations *new_fops = NULL; chain=unit&0x0F; if(chain==4 || chain==5) /* dsp/audio/dsp16 */ { unit&=0xF0; unit|=3; chain=3; } spin_lock(&sound_loader_lock); s = __look_for_unit(chain, unit); if (s) new_fops = fops_get(s->unit_fops); if (preclaim_oss && !new_fops) { spin_unlock(&sound_loader_lock); /* * Please, don't change this order or code. * For ALSA slot means soundcard and OSS emulation code * comes as add-on modules which aren't depend on * ALSA toplevel modules for soundcards, thus we need * load them at first. [Jaroslav Kysela <perex@jcu.cz>] */ request_module("sound-slot-%i", unit>>4); request_module("sound-service-%i-%i", unit>>4, chain); /* * sound-slot/service-* module aliases are scheduled * for removal in favor of the standard char-major-* * module aliases. For the time being, generate both * the legacy and standard module aliases to ease * transition. */ if (request_module("char-major-%d-%d", SOUND_MAJOR, unit) > 0) request_module("char-major-%d", SOUND_MAJOR); spin_lock(&sound_loader_lock); s = __look_for_unit(chain, unit); if (s) new_fops = fops_get(s->unit_fops); } spin_unlock(&sound_loader_lock); if (new_fops) { /* * We rely upon the fact that we can't be unloaded while the * subdriver is there. */ int err = 0; replace_fops(file, new_fops); if (file->f_op->open) err = file->f_op->open(inode,file); return err; } return -ENODEV; } MODULE_ALIAS_CHARDEV_MAJOR(SOUND_MAJOR); static void cleanup_oss_soundcore(void) { /* We have nothing to really do here - we know the lists must be empty */ unregister_chrdev(SOUND_MAJOR, "sound"); } static int __init init_oss_soundcore(void) { if (preclaim_oss && register_chrdev(SOUND_MAJOR, "sound", &soundcore_fops) < 0) { printk(KERN_ERR "soundcore: sound device already in use.\n"); return -EBUSY; } return 0; } #endif /* CONFIG_SOUND_OSS_CORE */ |
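The registration API above is what OSS-style drivers call at probe and remove time. The sketch below is not part of sound_core.c; my_dsp_fops, my_mixer_fops and the example_* functions are hypothetical driver symbols, and a real driver would populate its file_operations (including .owner = THIS_MODULE and the open handler) before registering.

#include <linux/fs.h>
#include <linux/sound.h>

static const struct file_operations my_dsp_fops;	/* driver-defined */
static const struct file_operations my_mixer_fops;	/* driver-defined */
static int my_dsp_unit, my_mixer_unit;

static int example_register_units(void)
{
	/* dev == -1 asks for the next free unit of each type */
	my_dsp_unit = register_sound_dsp(&my_dsp_fops, -1);
	if (my_dsp_unit < 0)
		return my_dsp_unit;

	my_mixer_unit = register_sound_mixer(&my_mixer_fops, -1);
	if (my_mixer_unit < 0) {
		unregister_sound_dsp(my_dsp_unit);
		return my_mixer_unit;
	}
	return 0;
}

static void example_unregister_units(void)
{
	/* pass back the unit numbers returned by the register calls */
	unregister_sound_mixer(my_mixer_unit);
	unregister_sound_dsp(my_dsp_unit);
}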
// SPDX-License-Identifier: GPL-2.0+
/*
 * mdt.c - meta data file for NILFS
 *
 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
 *
 * Written by Ryusuke Konishi.
*/ #include <linux/buffer_head.h> #include <linux/mpage.h> #include <linux/mm.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/swap.h> #include <linux/slab.h> #include "nilfs.h" #include "btnode.h" #include "segment.h" #include "page.h" #include "mdt.h" #include "alloc.h" /* nilfs_palloc_destroy_cache() */ #include <trace/events/nilfs2.h> #define NILFS_MDT_MAX_RA_BLOCKS (16 - 1) static int nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, struct buffer_head *bh, void (*init_block)(struct inode *, struct buffer_head *, void *)) { struct nilfs_inode_info *ii = NILFS_I(inode); void *kaddr; int ret; /* Caller exclude read accesses using page lock */ /* set_buffer_new(bh); */ bh->b_blocknr = 0; ret = nilfs_bmap_insert(ii->i_bmap, block, (unsigned long)bh); if (unlikely(ret)) return ret; set_buffer_mapped(bh); kaddr = kmap_atomic(bh->b_page); memset(kaddr + bh_offset(bh), 0, i_blocksize(inode)); if (init_block) init_block(inode, bh, kaddr); flush_dcache_page(bh->b_page); kunmap_atomic(kaddr); set_buffer_uptodate(bh); mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(inode); trace_nilfs2_mdt_insert_new_block(inode, inode->i_ino, block); return 0; } static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, struct buffer_head **out_bh, void (*init_block)(struct inode *, struct buffer_head *, void *)) { struct super_block *sb = inode->i_sb; struct nilfs_transaction_info ti; struct buffer_head *bh; int err; nilfs_transaction_begin(sb, &ti, 0); err = -ENOMEM; bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0); if (unlikely(!bh)) goto failed_unlock; err = -EEXIST; if (buffer_uptodate(bh)) goto failed_bh; wait_on_buffer(bh); if (buffer_uptodate(bh)) goto failed_bh; bh->b_bdev = sb->s_bdev; err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); if (likely(!err)) { get_bh(bh); *out_bh = bh; } failed_bh: unlock_page(bh->b_page); put_page(bh->b_page); brelse(bh); failed_unlock: if (likely(!err)) err = nilfs_transaction_commit(sb); else nilfs_transaction_abort(sb); return err; } static int nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, int mode, int mode_flags, struct buffer_head **out_bh) { struct buffer_head *bh; __u64 blknum = 0; int ret = -ENOMEM; bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); if (unlikely(!bh)) goto failed; ret = -EEXIST; /* internal code */ if (buffer_uptodate(bh)) goto out; if (mode_flags & REQ_RAHEAD) { if (!trylock_buffer(bh)) { ret = -EBUSY; goto failed_bh; } } else /* mode == READ */ lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); goto out; } ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum); if (unlikely(ret)) { unlock_buffer(bh); goto failed_bh; } map_bh(bh, inode->i_sb, (sector_t)blknum); bh->b_end_io = end_buffer_read_sync; get_bh(bh); submit_bh(mode, mode_flags, bh); ret = 0; trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode); out: get_bh(bh); *out_bh = bh; failed_bh: unlock_page(bh->b_page); put_page(bh->b_page); brelse(bh); failed: return ret; } static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, int readahead, struct buffer_head **out_bh) { struct buffer_head *first_bh, *bh; unsigned long blkoff; int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS; int err; err = nilfs_mdt_submit_block(inode, block, REQ_OP_READ, 0, &first_bh); if (err == -EEXIST) /* internal code */ goto out; if (unlikely(err)) goto failed; if (readahead) { blkoff = block + 1; for (i = 0; i < nr_ra_blocks; i++, blkoff++) { err = 
nilfs_mdt_submit_block(inode, blkoff, REQ_OP_READ, REQ_RAHEAD, &bh); if (likely(!err || err == -EEXIST)) brelse(bh); else if (err != -EBUSY) break; /* abort readahead if bmap lookup failed */ if (!buffer_locked(first_bh)) goto out_no_wait; } } wait_on_buffer(first_bh); out_no_wait: err = -EIO; if (!buffer_uptodate(first_bh)) { nilfs_msg(inode->i_sb, KERN_ERR, "I/O error reading meta-data file (ino=%lu, block-offset=%lu)", inode->i_ino, block); goto failed_bh; } out: *out_bh = first_bh; return 0; failed_bh: brelse(first_bh); failed: return err; } /** * nilfs_mdt_get_block - read or create a buffer on meta data file. * @inode: inode of the meta data file * @blkoff: block offset * @create: create flag * @init_block: initializer used for newly allocated block * @out_bh: output of a pointer to the buffer_head * * nilfs_mdt_get_block() looks up the specified buffer and tries to create * a new buffer if @create is not zero. On success, the returned buffer is * assured to be either existing or formatted using a buffer lock on success. * @out_bh is substituted only when zero is returned. * * Return Value: On success, it returns 0. On error, the following negative * error code is returned. * * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error * * %-ENOENT - the specified block does not exist (hole block) * * %-EROFS - Read only filesystem (for create mode) */ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, void (*init_block)(struct inode *, struct buffer_head *, void *), struct buffer_head **out_bh) { int ret; /* Should be rewritten with merging nilfs_mdt_read_block() */ retry: ret = nilfs_mdt_read_block(inode, blkoff, !create, out_bh); if (!create || ret != -ENOENT) return ret; ret = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block); if (unlikely(ret == -EEXIST)) { /* create = 0; */ /* limit read-create loop retries */ goto retry; } return ret; } /** * nilfs_mdt_find_block - find and get a buffer on meta data file. * @inode: inode of the meta data file * @start: start block offset (inclusive) * @end: end block offset (inclusive) * @blkoff: block offset * @out_bh: place to store a pointer to buffer_head struct * * nilfs_mdt_find_block() looks up an existing block in range of * [@start, @end] and stores pointer to a buffer head of the block to * @out_bh, and block offset to @blkoff, respectively. @out_bh and * @blkoff are substituted only when zero is returned. * * Return Value: On success, it returns 0. On error, the following negative * error code is returned. * * %-ENOMEM - Insufficient memory available. * * %-EIO - I/O error * * %-ENOENT - no block was found in the range */ int nilfs_mdt_find_block(struct inode *inode, unsigned long start, unsigned long end, unsigned long *blkoff, struct buffer_head **out_bh) { __u64 next; int ret; if (unlikely(start > end)) return -ENOENT; ret = nilfs_mdt_read_block(inode, start, true, out_bh); if (!ret) { *blkoff = start; goto out; } if (unlikely(ret != -ENOENT || start == ULONG_MAX)) goto out; ret = nilfs_bmap_seek_key(NILFS_I(inode)->i_bmap, start + 1, &next); if (!ret) { if (next <= end) { ret = nilfs_mdt_read_block(inode, next, true, out_bh); if (!ret) *blkoff = next; } else { ret = -ENOENT; } } out: return ret; } /** * nilfs_mdt_delete_block - make a hole on the meta data file. * @inode: inode of the meta data file * @block: block offset * * Return Value: On success, zero is returned. * On error, one of the following negative error code is returned. * * %-ENOMEM - Insufficient memory available. 
* * %-EIO - I/O error */ int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) { struct nilfs_inode_info *ii = NILFS_I(inode); int err; err = nilfs_bmap_delete(ii->i_bmap, block); if (!err || err == -ENOENT) { nilfs_mdt_mark_dirty(inode); nilfs_mdt_forget_block(inode, block); } return err; } /** * nilfs_mdt_forget_block - discard dirty state and try to remove the page * @inode: inode of the meta data file * @block: block offset * * nilfs_mdt_forget_block() clears a dirty flag of the specified buffer, and * tries to release the page including the buffer from a page cache. * * Return Value: On success, 0 is returned. On error, one of the following * negative error code is returned. * * %-EBUSY - page has an active buffer. * * %-ENOENT - page cache has no page addressed by the offset. */ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) { pgoff_t index = (pgoff_t)block >> (PAGE_SHIFT - inode->i_blkbits); struct page *page; unsigned long first_block; int ret = 0; int still_dirty; page = find_lock_page(inode->i_mapping, index); if (!page) return -ENOENT; wait_on_page_writeback(page); first_block = (unsigned long)index << (PAGE_SHIFT - inode->i_blkbits); if (page_has_buffers(page)) { struct buffer_head *bh; bh = nilfs_page_get_nth_block(page, block - first_block); nilfs_forget_buffer(bh); } still_dirty = PageDirty(page); unlock_page(page); put_page(page); if (still_dirty || invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0) ret = -EBUSY; return ret; } int nilfs_mdt_fetch_dirty(struct inode *inode) { struct nilfs_inode_info *ii = NILFS_I(inode); if (nilfs_bmap_test_and_clear_dirty(ii->i_bmap)) { set_bit(NILFS_I_DIRTY, &ii->i_state); return 1; } return test_bit(NILFS_I_DIRTY, &ii->i_state); } static int nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; struct super_block *sb; int err = 0; if (inode && sb_rdonly(inode->i_sb)) { /* * It means that filesystem was remounted in read-only * mode because of error or metadata corruption. But we * have dirty pages that try to be flushed in background. * So, here we simply discard this dirty page. 
*/ nilfs_clear_dirty_page(page, false); unlock_page(page); return -EROFS; } redirty_page_for_writepage(wbc, page); unlock_page(page); if (!inode) return 0; sb = inode->i_sb; if (wbc->sync_mode == WB_SYNC_ALL) err = nilfs_construct_segment(sb); else if (wbc->for_reclaim) nilfs_flush_segment(sb, inode->i_ino); return err; } static const struct address_space_operations def_mdt_aops = { .writepage = nilfs_mdt_write_page, }; static const struct inode_operations def_mdt_iops; static const struct file_operations def_mdt_fops; int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz) { struct nilfs_mdt_info *mi; mi = kzalloc(max(sizeof(*mi), objsz), GFP_NOFS); if (!mi) return -ENOMEM; init_rwsem(&mi->mi_sem); inode->i_private = mi; inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, gfp_mask); inode->i_op = &def_mdt_iops; inode->i_fop = &def_mdt_fops; inode->i_mapping->a_ops = &def_mdt_aops; return 0; } /** * nilfs_mdt_clear - do cleanup for the metadata file * @inode: inode of the metadata file */ void nilfs_mdt_clear(struct inode *inode) { struct nilfs_mdt_info *mdi = NILFS_MDT(inode); struct nilfs_shadow_map *shadow = mdi->mi_shadow; if (mdi->mi_palloc_cache) nilfs_palloc_destroy_cache(inode); if (shadow) { struct inode *s_inode = shadow->inode; shadow->inode = NULL; iput(s_inode); mdi->mi_shadow = NULL; } } /** * nilfs_mdt_destroy - release resources used by the metadata file * @inode: inode of the metadata file */ void nilfs_mdt_destroy(struct inode *inode) { struct nilfs_mdt_info *mdi = NILFS_MDT(inode); kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ kfree(mdi); } void nilfs_mdt_set_entry_size(struct inode *inode, unsigned int entry_size, unsigned int header_size) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); mi->mi_entry_size = entry_size; mi->mi_entries_per_block = i_blocksize(inode) / entry_size; mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); } /** * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file * @inode: inode of the metadata file * @shadow: shadow mapping */ int nilfs_mdt_setup_shadow_map(struct inode *inode, struct nilfs_shadow_map *shadow) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); struct inode *s_inode; INIT_LIST_HEAD(&shadow->frozen_buffers); s_inode = nilfs_iget_for_shadow(inode); if (IS_ERR(s_inode)) return PTR_ERR(s_inode); shadow->inode = s_inode; mi->mi_shadow = shadow; return 0; } /** * nilfs_mdt_save_to_shadow_map - copy bmap and dirty pages to shadow map * @inode: inode of the metadata file */ int nilfs_mdt_save_to_shadow_map(struct inode *inode) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); struct nilfs_inode_info *ii = NILFS_I(inode); struct nilfs_shadow_map *shadow = mi->mi_shadow; struct inode *s_inode = shadow->inode; int ret; ret = nilfs_copy_dirty_pages(s_inode->i_mapping, inode->i_mapping); if (ret) goto out; ret = nilfs_copy_dirty_pages(NILFS_I(s_inode)->i_assoc_inode->i_mapping, ii->i_assoc_inode->i_mapping); if (ret) goto out; nilfs_bmap_save(ii->i_bmap, &shadow->bmap_store); out: return ret; } int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) { struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow; struct buffer_head *bh_frozen; struct page *page; int blkbits = inode->i_blkbits; page = grab_cache_page(shadow->inode->i_mapping, bh->b_page->index); if (!page) return -ENOMEM; if (!page_has_buffers(page)) create_empty_buffers(page, 1 << blkbits, 0); bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits); if (!buffer_uptodate(bh_frozen)) 
nilfs_copy_buffer(bh_frozen, bh); if (list_empty(&bh_frozen->b_assoc_buffers)) { list_add_tail(&bh_frozen->b_assoc_buffers, &shadow->frozen_buffers); set_buffer_nilfs_redirected(bh); } else { brelse(bh_frozen); /* already frozen */ } unlock_page(page); put_page(page); return 0; } struct buffer_head * nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh) { struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow; struct buffer_head *bh_frozen = NULL; struct page *page; int n; page = find_lock_page(shadow->inode->i_mapping, bh->b_page->index); if (page) { if (page_has_buffers(page)) { n = bh_offset(bh) >> inode->i_blkbits; bh_frozen = nilfs_page_get_nth_block(page, n); } unlock_page(page); put_page(page); } return bh_frozen; } static void nilfs_release_frozen_buffers(struct nilfs_shadow_map *shadow) { struct list_head *head = &shadow->frozen_buffers; struct buffer_head *bh; while (!list_empty(head)) { bh = list_first_entry(head, struct buffer_head, b_assoc_buffers); list_del_init(&bh->b_assoc_buffers); brelse(bh); /* drop ref-count to make it releasable */ } } /** * nilfs_mdt_restore_from_shadow_map - restore dirty pages and bmap state * @inode: inode of the metadata file */ void nilfs_mdt_restore_from_shadow_map(struct inode *inode) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); struct nilfs_inode_info *ii = NILFS_I(inode); struct nilfs_shadow_map *shadow = mi->mi_shadow; down_write(&mi->mi_sem); if (mi->mi_palloc_cache) nilfs_palloc_clear_cache(inode); nilfs_clear_dirty_pages(inode->i_mapping, true); nilfs_copy_back_pages(inode->i_mapping, shadow->inode->i_mapping); nilfs_clear_dirty_pages(ii->i_assoc_inode->i_mapping, true); nilfs_copy_back_pages(ii->i_assoc_inode->i_mapping, NILFS_I(shadow->inode)->i_assoc_inode->i_mapping); nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store); up_write(&mi->mi_sem); } /** * nilfs_mdt_clear_shadow_map - truncate pages in shadow map caches * @inode: inode of the metadata file */ void nilfs_mdt_clear_shadow_map(struct inode *inode) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); struct nilfs_shadow_map *shadow = mi->mi_shadow; struct inode *shadow_btnc_inode = NILFS_I(shadow->inode)->i_assoc_inode; down_write(&mi->mi_sem); nilfs_release_frozen_buffers(shadow); truncate_inode_pages(shadow->inode->i_mapping, 0); truncate_inode_pages(shadow_btnc_inode->i_mapping, 0); up_write(&mi->mi_sem); } |
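nilfs_mdt_get_block() followed by brelse() is the usual access pattern for the metadata files built on this code (cpfile, sufile, DAT). A minimal read-only sketch follows; it is not part of mdt.c, and example_touch_mdt_block and mdt_inode are hypothetical names.

static int example_touch_mdt_block(struct inode *mdt_inode,
				   unsigned long blkoff)
{
	struct buffer_head *bh;
	int ret;

	/* create == 0: look up an existing block only; holes give -ENOENT */
	ret = nilfs_mdt_get_block(mdt_inode, blkoff, 0, NULL, &bh);
	if (ret < 0)
		return ret;

	/* ... map bh->b_page / read on-disk entries from the buffer ... */

	brelse(bh);	/* drop the reference taken by nilfs_mdt_get_block() */
	return 0;
}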
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
 *
 * This file contains the core interrupt handling code for irq-chip based
 * architectures.  Detailed information is available in
 * Documentation/core-api/genericirq.rst
 */

#include <linux/irq.h>
#include <linux/msi.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/irqdomain.h>

#include <trace/events/irq.h>

#include "internals.h"

static irqreturn_t bad_chained_irq(int irq, void *dev_id)
{
	WARN_ONCE(1, "Chained irq %d should not call an action\n", irq);
	return IRQ_NONE;
}

/*
 * Chained handlers should never call action handlers on their IRQ.  This
 * default action will emit a warning if that ever happens.
*/ struct irqaction chained_action = { .handler = bad_chained_irq, }; /** * irq_set_chip - set the irq chip for an irq * @irq: irq number * @chip: pointer to irq chip description structure */ int irq_set_chip(unsigned int irq, struct irq_chip *chip) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return -EINVAL; if (!chip) chip = &no_irq_chip; desc->irq_data.chip = chip; irq_put_desc_unlock(desc, flags); /* * For !CONFIG_SPARSE_IRQ make the irq show up in * allocated_irqs. */ irq_mark_irq(irq); return 0; } EXPORT_SYMBOL(irq_set_chip); /** * irq_set_type - set the irq trigger type for an irq * @irq: irq number * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h */ int irq_set_irq_type(unsigned int irq, unsigned int type) { unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); int ret = 0; if (!desc) return -EINVAL; ret = __irq_set_trigger(desc, type); irq_put_desc_busunlock(desc, flags); return ret; } EXPORT_SYMBOL(irq_set_irq_type); /** * irq_set_handler_data - set irq handler data for an irq * @irq: Interrupt number * @data: Pointer to interrupt specific data * * Set the hardware irq controller data for an irq */ int irq_set_handler_data(unsigned int irq, void *data) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return -EINVAL; desc->irq_common_data.handler_data = data; irq_put_desc_unlock(desc, flags); return 0; } EXPORT_SYMBOL(irq_set_handler_data); /** * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset * @irq_base: Interrupt number base * @irq_offset: Interrupt number offset * @entry: Pointer to MSI descriptor data * * Set the MSI descriptor entry for an irq at offset */ int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, struct msi_desc *entry) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return -EINVAL; desc->irq_common_data.msi_desc = entry; if (entry && !irq_offset) entry->irq = irq_base; irq_put_desc_unlock(desc, flags); return 0; } /** * irq_set_msi_desc - set MSI descriptor data for an irq * @irq: Interrupt number * @entry: Pointer to MSI descriptor data * * Set the MSI descriptor entry for an irq */ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) { return irq_set_msi_desc_off(irq, 0, entry); } /** * irq_set_chip_data - set irq chip data for an irq * @irq: Interrupt number * @data: Pointer to chip specific data * * Set the hardware irq chip data for an irq */ int irq_set_chip_data(unsigned int irq, void *data) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return -EINVAL; desc->irq_data.chip_data = data; irq_put_desc_unlock(desc, flags); return 0; } EXPORT_SYMBOL(irq_set_chip_data); struct irq_data *irq_get_irq_data(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); return desc ? 
&desc->irq_data : NULL; } EXPORT_SYMBOL_GPL(irq_get_irq_data); static void irq_state_clr_disabled(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); } static void irq_state_clr_masked(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); } static void irq_state_clr_started(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED); } static void irq_state_set_started(struct irq_desc *desc) { irqd_set(&desc->irq_data, IRQD_IRQ_STARTED); } enum { IRQ_STARTUP_NORMAL, IRQ_STARTUP_MANAGED, IRQ_STARTUP_ABORT, }; #ifdef CONFIG_SMP static int __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) { struct irq_data *d = irq_desc_get_irq_data(desc); if (!irqd_affinity_is_managed(d)) return IRQ_STARTUP_NORMAL; irqd_clr_managed_shutdown(d); if (cpumask_any_and(aff, cpu_online_mask) >= nr_cpu_ids) { /* * Catch code which fiddles with enable_irq() on a managed * and potentially shutdown IRQ. Chained interrupt * installment or irq auto probing should not happen on * managed irqs either. */ if (WARN_ON_ONCE(force)) return IRQ_STARTUP_ABORT; /* * The interrupt was requested, but there is no online CPU * in it's affinity mask. Put it into managed shutdown * state and let the cpu hotplug mechanism start it up once * a CPU in the mask becomes available. */ return IRQ_STARTUP_ABORT; } /* * Managed interrupts have reserved resources, so this should not * happen. */ if (WARN_ON(irq_domain_activate_irq(d, false))) return IRQ_STARTUP_ABORT; return IRQ_STARTUP_MANAGED; } #else static __always_inline int __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force) { return IRQ_STARTUP_NORMAL; } #endif static int __irq_startup(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); int ret = 0; /* Warn if this interrupt is not activated but try nevertheless */ WARN_ON_ONCE(!irqd_is_activated(d)); if (d->chip->irq_startup) { ret = d->chip->irq_startup(d); irq_state_clr_disabled(desc); irq_state_clr_masked(desc); } else { irq_enable(desc); } irq_state_set_started(desc); return ret; } int irq_startup(struct irq_desc *desc, bool resend, bool force) { struct irq_data *d = irq_desc_get_irq_data(desc); struct cpumask *aff = irq_data_get_affinity_mask(d); int ret = 0; desc->depth = 0; if (irqd_is_started(d)) { irq_enable(desc); } else { switch (__irq_startup_managed(desc, aff, force)) { case IRQ_STARTUP_NORMAL: if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP) irq_setup_affinity(desc); ret = __irq_startup(desc); if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)) irq_setup_affinity(desc); break; case IRQ_STARTUP_MANAGED: irq_do_set_affinity(d, aff, false); ret = __irq_startup(desc); break; case IRQ_STARTUP_ABORT: irqd_set_managed_shutdown(d); return 0; } } if (resend) check_irq_resend(desc); return ret; } int irq_activate(struct irq_desc *desc) { struct irq_data *d = irq_desc_get_irq_data(desc); if (!irqd_affinity_is_managed(d)) return irq_domain_activate_irq(d, false); return 0; } int irq_activate_and_startup(struct irq_desc *desc, bool resend) { if (WARN_ON(irq_activate(desc))) return 0; return irq_startup(desc, resend, IRQ_START_FORCE); } static void __irq_disable(struct irq_desc *desc, bool mask); void irq_shutdown(struct irq_desc *desc) { if (irqd_is_started(&desc->irq_data)) { desc->depth = 1; if (desc->irq_data.chip->irq_shutdown) { desc->irq_data.chip->irq_shutdown(&desc->irq_data); irq_state_set_disabled(desc); irq_state_set_masked(desc); } else { __irq_disable(desc, true); } irq_state_clr_started(desc); 
} } void irq_shutdown_and_deactivate(struct irq_desc *desc) { irq_shutdown(desc); /* * This must be called even if the interrupt was never started up, * because the activation can happen before the interrupt is * available for request/startup. It has it's own state tracking so * it's safe to call it unconditionally. */ irq_domain_deactivate_irq(&desc->irq_data); } void irq_enable(struct irq_desc *desc) { if (!irqd_irq_disabled(&desc->irq_data)) { unmask_irq(desc); } else { irq_state_clr_disabled(desc); if (desc->irq_data.chip->irq_enable) { desc->irq_data.chip->irq_enable(&desc->irq_data); irq_state_clr_masked(desc); } else { unmask_irq(desc); } } } static void __irq_disable(struct irq_desc *desc, bool mask) { if (irqd_irq_disabled(&desc->irq_data)) { if (mask) mask_irq(desc); } else { irq_state_set_disabled(desc); if (desc->irq_data.chip->irq_disable) { desc->irq_data.chip->irq_disable(&desc->irq_data); irq_state_set_masked(desc); } else if (mask) { mask_irq(desc); } } } /** * irq_disable - Mark interrupt disabled * @desc: irq descriptor which should be disabled * * If the chip does not implement the irq_disable callback, we * use a lazy disable approach. That means we mark the interrupt * disabled, but leave the hardware unmasked. That's an * optimization because we avoid the hardware access for the * common case where no interrupt happens after we marked it * disabled. If an interrupt happens, then the interrupt flow * handler masks the line at the hardware level and marks it * pending. * * If the interrupt chip does not implement the irq_disable callback, * a driver can disable the lazy approach for a particular irq line by * calling 'irq_set_status_flags(irq, IRQ_DISABLE_UNLAZY)'. This can * be used for devices which cannot disable the interrupt at the * device level under certain circumstances and have to use * disable_irq[_nosync] instead. */ void irq_disable(struct irq_desc *desc) { __irq_disable(desc, irq_settings_disable_unlazy(desc)); } void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu) { if (desc->irq_data.chip->irq_enable) desc->irq_data.chip->irq_enable(&desc->irq_data); else desc->irq_data.chip->irq_unmask(&desc->irq_data); cpumask_set_cpu(cpu, desc->percpu_enabled); } void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu) { if (desc->irq_data.chip->irq_disable) desc->irq_data.chip->irq_disable(&desc->irq_data); else desc->irq_data.chip->irq_mask(&desc->irq_data); cpumask_clear_cpu(cpu, desc->percpu_enabled); } static inline void mask_ack_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_mask_ack) { desc->irq_data.chip->irq_mask_ack(&desc->irq_data); irq_state_set_masked(desc); } else { mask_irq(desc); if (desc->irq_data.chip->irq_ack) desc->irq_data.chip->irq_ack(&desc->irq_data); } } void mask_irq(struct irq_desc *desc) { if (irqd_irq_masked(&desc->irq_data)) return; if (desc->irq_data.chip->irq_mask) { desc->irq_data.chip->irq_mask(&desc->irq_data); irq_state_set_masked(desc); } } void unmask_irq(struct irq_desc *desc) { if (!irqd_irq_masked(&desc->irq_data)) return; if (desc->irq_data.chip->irq_unmask) { desc->irq_data.chip->irq_unmask(&desc->irq_data); irq_state_clr_masked(desc); } } void unmask_threaded_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; if (chip->flags & IRQCHIP_EOI_THREADED) chip->irq_eoi(&desc->irq_data); unmask_irq(desc); } /* * handle_nested_irq - Handle a nested irq from a irq thread * @irq: the interrupt number * * Handle interrupts which are nested into a threaded interrupt * handler. 
The handler function is called inside the calling * threads context. */ void handle_nested_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; irqreturn_t action_ret; might_sleep(); raw_spin_lock_irq(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); action = desc->action; if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; goto out_unlock; } kstat_incr_irqs_this_cpu(desc); irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); action_ret = IRQ_NONE; for_each_action_of_desc(desc, action) action_ret |= action->thread_fn(action->irq, action->dev_id); if (!noirqdebug) note_interrupt(desc, action_ret); raw_spin_lock_irq(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); out_unlock: raw_spin_unlock_irq(&desc->lock); } EXPORT_SYMBOL_GPL(handle_nested_irq); static bool irq_check_poll(struct irq_desc *desc) { if (!(desc->istate & IRQS_POLL_INPROGRESS)) return false; return irq_wait_for_poll(desc); } static bool irq_may_run(struct irq_desc *desc) { unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED; /* * If the interrupt is not in progress and is not an armed * wakeup interrupt, proceed. */ if (!irqd_has_set(&desc->irq_data, mask)) return true; /* * If the interrupt is an armed wakeup source, mark it pending * and suspended, disable it and notify the pm core about the * event. */ if (irq_pm_check_wakeup(desc)) return false; /* * Handle a potential concurrent poll on a different core. */ return irq_check_poll(desc); } /** * handle_simple_irq - Simple and software-decoded IRQs. * @desc: the interrupt description structure for this irq * * Simple interrupts are either sent from a demultiplexing interrupt * handler or come from hardware, where no interrupt hardware control * is necessary. * * Note: The caller is expected to handle the ack, clear, mask and * unmask issues if necessary. */ void handle_simple_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); if (!irq_may_run(desc)) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; goto out_unlock; } kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); out_unlock: raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_simple_irq); /** * handle_untracked_irq - Simple and software-decoded IRQs. * @desc: the interrupt description structure for this irq * * Untracked interrupts are sent from a demultiplexing interrupt * handler when the demultiplexer does not know which device it its * multiplexed irq domain generated the interrupt. IRQ's handled * through here are not subjected to stats tracking, randomness, or * spurious interrupt detection. * * Note: Like handle_simple_irq, the caller is expected to handle * the ack, clear, mask and unmask issues if necessary. 
*/ void handle_untracked_irq(struct irq_desc *desc) { unsigned int flags = 0; raw_spin_lock(&desc->lock); if (!irq_may_run(desc)) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; goto out_unlock; } desc->istate &= ~IRQS_PENDING; irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock(&desc->lock); __handle_irq_event_percpu(desc, &flags); raw_spin_lock(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); out_unlock: raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_untracked_irq); /* * Called unconditionally from handle_level_irq() and only for oneshot * interrupts from handle_fasteoi_irq() */ static void cond_unmask_irq(struct irq_desc *desc) { /* * We need to unmask in the following cases: * - Standard level irq (IRQF_ONESHOT is not set) * - Oneshot irq which did not wake the thread (caused by a * spurious interrupt or a primary handler handling it * completely). */ if (!irqd_irq_disabled(&desc->irq_data) && irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) unmask_irq(desc); } /** * handle_level_irq - Level type irq handler * @desc: the interrupt description structure for this irq * * Level type interrupts are active as long as the hardware line has * the active level. This may require to mask the interrupt and unmask * it after the associated handler has acknowledged the device, so the * interrupt line is back to inactive. */ void handle_level_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); mask_ack_irq(desc); if (!irq_may_run(desc)) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); /* * If its disabled or no action available * keep it masked and get out of here */ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; goto out_unlock; } kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); cond_unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_level_irq); #ifdef CONFIG_IRQ_PREFLOW_FASTEOI static inline void preflow_handler(struct irq_desc *desc) { if (desc->preflow_handler) desc->preflow_handler(&desc->irq_data); } #else static inline void preflow_handler(struct irq_desc *desc) { } #endif static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) { if (!(desc->istate & IRQS_ONESHOT)) { chip->irq_eoi(&desc->irq_data); return; } /* * We need to unmask in the following cases: * - Oneshot irq which did not wake the thread (caused by a * spurious interrupt or a primary handler handling it * completely). */ if (!irqd_irq_disabled(&desc->irq_data) && irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) { chip->irq_eoi(&desc->irq_data); unmask_irq(desc); } else if (!(chip->flags & IRQCHIP_EOI_THREADED)) { chip->irq_eoi(&desc->irq_data); } } /** * handle_fasteoi_irq - irq handler for transparent controllers * @desc: the interrupt description structure for this irq * * Only a single callback will be issued to the chip: an ->eoi() * call when the interrupt has been serviced. This enables support * for modern forms of interrupt handlers, which handle the flow * details in hardware, transparently. 
*/ void handle_fasteoi_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; raw_spin_lock(&desc->lock); if (!irq_may_run(desc)) goto out; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); /* * If its disabled or no action available * then mask it and get out of here: */ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; mask_irq(desc); goto out; } kstat_incr_irqs_this_cpu(desc); if (desc->istate & IRQS_ONESHOT) mask_irq(desc); preflow_handler(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); raw_spin_unlock(&desc->lock); return; out: if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_fasteoi_irq); /** * handle_edge_irq - edge type IRQ handler * @desc: the interrupt description structure for this irq * * Interrupt occures on the falling and/or rising edge of a hardware * signal. The occurrence is latched into the irq controller hardware * and must be acked in order to be reenabled. After the ack another * interrupt can happen on the same source even before the first one * is handled by the associated event handler. If this happens it * might be necessary to disable (mask) the interrupt depending on the * controller hardware. This requires to reenable the interrupt inside * of the loop which handles the interrupts which have arrived while * the handler was running. If all pending interrupts are handled, the * loop is left. */ void handle_edge_irq(struct irq_desc *desc) { raw_spin_lock(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); if (!irq_may_run(desc)) { desc->istate |= IRQS_PENDING; mask_ack_irq(desc); goto out_unlock; } /* * If its disabled or no action available then mask it and get * out of here. */ if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { desc->istate |= IRQS_PENDING; mask_ack_irq(desc); goto out_unlock; } kstat_incr_irqs_this_cpu(desc); /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); do { if (unlikely(!desc->action)) { mask_irq(desc); goto out_unlock; } /* * When another irq arrived while we were handling * one, we could have masked the irq. * Renable it, if it was not disabled in meantime. */ if (unlikely(desc->istate & IRQS_PENDING)) { if (!irqd_irq_disabled(&desc->irq_data) && irqd_irq_masked(&desc->irq_data)) unmask_irq(desc); } handle_irq_event(desc); } while ((desc->istate & IRQS_PENDING) && !irqd_irq_disabled(&desc->irq_data)); out_unlock: raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL(handle_edge_irq); #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER /** * handle_edge_eoi_irq - edge eoi type IRQ handler * @desc: the interrupt description structure for this irq * * Similar as the above handle_edge_irq, but using eoi and w/o the * mask/unmask logic. */ void handle_edge_eoi_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); raw_spin_lock(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); if (!irq_may_run(desc)) { desc->istate |= IRQS_PENDING; goto out_eoi; } /* * If its disabled or no action available then mask it and get * out of here. 
*/ if (irqd_irq_disabled(&desc->irq_data) || !desc->action) { desc->istate |= IRQS_PENDING; goto out_eoi; } kstat_incr_irqs_this_cpu(desc); do { if (unlikely(!desc->action)) goto out_eoi; handle_irq_event(desc); } while ((desc->istate & IRQS_PENDING) && !irqd_irq_disabled(&desc->irq_data)); out_eoi: chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } #endif /** * handle_percpu_irq - Per CPU local irq handler * @desc: the interrupt description structure for this irq * * Per CPU interrupts on SMP machines without locking requirements */ void handle_percpu_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); /* * PER CPU interrupts are not serialized. Do not touch * desc->tot_count. */ __kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); handle_irq_event_percpu(desc); if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); } /** * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids * @desc: the interrupt description structure for this irq * * Per CPU interrupts on SMP machines without locking requirements. Same as * handle_percpu_irq() above but with the following extras: * * action->percpu_dev_id is a pointer to percpu variables which * contain the real device id for the cpu on which this handler is * called */ void handle_percpu_devid_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irqaction *action = desc->action; unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; /* * PER CPU interrupts are not serialized. Do not touch * desc->tot_count. */ __kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); if (likely(action)) { trace_irq_handler_entry(irq, action); res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); trace_irq_handler_exit(irq, action, res); } else { unsigned int cpu = smp_processor_id(); bool enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); if (enabled) irq_percpu_disable(desc, cpu); pr_err_once("Spurious%s percpu IRQ%u on CPU%u\n", enabled ? " and unmasked" : "", irq, cpu); } if (chip->irq_eoi) chip->irq_eoi(&desc->irq_data); } static void __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, int is_chained, const char *name) { if (!handle) { handle = handle_bad_irq; } else { struct irq_data *irq_data = &desc->irq_data; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY /* * With hierarchical domains we might run into a * situation where the outermost chip is not yet set * up, but the inner chips are there. Instead of * bailing we install the handler, but obviously we * cannot enable/startup the interrupt at this point. */ while (irq_data) { if (irq_data->chip != &no_irq_chip) break; /* * Bail out if the outer chip is not set up * and the interrrupt supposed to be started * right away. */ if (WARN_ON(is_chained)) return; /* Try the parent */ irq_data = irq_data->parent_data; } #endif if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip)) return; } /* Uninstall? */ if (handle == handle_bad_irq) { if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); irq_state_set_disabled(desc); if (is_chained) desc->action = NULL; desc->depth = 1; } desc->handle_irq = handle; desc->name = name; if (handle != handle_bad_irq && is_chained) { unsigned int type = irqd_get_trigger_type(&desc->irq_data); /* * We're about to start this interrupt immediately, * hence the need to set the trigger configuration. 
* But the .set_type callback may have overridden the * flow handler, ignoring that we're dealing with a * chained interrupt. Reset it immediately because we * do know better. */ if (type != IRQ_TYPE_NONE) { __irq_set_trigger(desc, type); desc->handle_irq = handle; } irq_settings_set_noprobe(desc); irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); desc->action = &chained_action; irq_activate_and_startup(desc, IRQ_RESEND); } } void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) { unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); if (!desc) return; __irq_do_set_handler(desc, handle, is_chained, name); irq_put_desc_busunlock(desc, flags); } EXPORT_SYMBOL_GPL(__irq_set_handler); void irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, void *data) { unsigned long flags; struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); if (!desc) return; desc->irq_common_data.handler_data = data; __irq_do_set_handler(desc, handle, 1, NULL); irq_put_desc_busunlock(desc, flags); } EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data); void irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle, const char *name) { irq_set_chip(irq, chip); __irq_set_handler(irq, handle, 0, name); } EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { unsigned long flags, trigger, tmp; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return; /* * Warn when a driver sets the no autoenable flag on an already * active interrupt. */ WARN_ON_ONCE(!desc->depth && (set & _IRQ_NOAUTOEN)); irq_settings_clr_and_set(desc, clr, set); trigger = irqd_get_trigger_type(&desc->irq_data); irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); if (irq_settings_has_no_balance_set(desc)) irqd_set(&desc->irq_data, IRQD_NO_BALANCING); if (irq_settings_is_per_cpu(desc)) irqd_set(&desc->irq_data, IRQD_PER_CPU); if (irq_settings_can_move_pcntxt(desc)) irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); if (irq_settings_is_level(desc)) irqd_set(&desc->irq_data, IRQD_LEVEL); tmp = irq_settings_get_trigger_mask(desc); if (tmp != IRQ_TYPE_NONE) trigger = tmp; irqd_set(&desc->irq_data, trigger); irq_put_desc_unlock(desc, flags); } EXPORT_SYMBOL_GPL(irq_modify_status); /** * irq_cpu_online - Invoke all irq_cpu_online functions. * * Iterate through all irqs and invoke the chip.irq_cpu_online() * for each. */ void irq_cpu_online(void) { struct irq_desc *desc; struct irq_chip *chip; unsigned long flags; unsigned int irq; for_each_active_irq(irq) { desc = irq_to_desc(irq); if (!desc) continue; raw_spin_lock_irqsave(&desc->lock, flags); chip = irq_data_get_irq_chip(&desc->irq_data); if (chip && chip->irq_cpu_online && (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || !irqd_irq_disabled(&desc->irq_data))) chip->irq_cpu_online(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); } } /** * irq_cpu_offline - Invoke all irq_cpu_offline functions. * * Iterate through all irqs and invoke the chip.irq_cpu_offline() * for each. 
*/ void irq_cpu_offline(void) { struct irq_desc *desc; struct irq_chip *chip; unsigned long flags; unsigned int irq; for_each_active_irq(irq) { desc = irq_to_desc(irq); if (!desc) continue; raw_spin_lock_irqsave(&desc->lock, flags); chip = irq_data_get_irq_chip(&desc->irq_data); if (chip && chip->irq_cpu_offline && (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || !irqd_irq_disabled(&desc->irq_data))) chip->irq_cpu_offline(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); } } #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY #ifdef CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS /** * handle_fasteoi_ack_irq - irq handler for edge hierarchy * stacked on transparent controllers * * @desc: the interrupt description structure for this irq * * Like handle_fasteoi_irq(), but for use with hierarchy where * the irq_chip also needs to have its ->irq_ack() function * called. */ void handle_fasteoi_ack_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; raw_spin_lock(&desc->lock); if (!irq_may_run(desc)) goto out; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); /* * If its disabled or no action available * then mask it and get out of here: */ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; mask_irq(desc); goto out; } kstat_incr_irqs_this_cpu(desc); if (desc->istate & IRQS_ONESHOT) mask_irq(desc); /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); preflow_handler(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); raw_spin_unlock(&desc->lock); return; out: if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_fasteoi_ack_irq); /** * handle_fasteoi_mask_irq - irq handler for level hierarchy * stacked on transparent controllers * * @desc: the interrupt description structure for this irq * * Like handle_fasteoi_irq(), but for use with hierarchy where * the irq_chip also needs to have its ->irq_mask_ack() function * called. 
*/ void handle_fasteoi_mask_irq(struct irq_desc *desc) { struct irq_chip *chip = desc->irq_data.chip; raw_spin_lock(&desc->lock); mask_ack_irq(desc); if (!irq_may_run(desc)) goto out; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); /* * If its disabled or no action available * then mask it and get out of here: */ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; mask_irq(desc); goto out; } kstat_incr_irqs_this_cpu(desc); if (desc->istate & IRQS_ONESHOT) mask_irq(desc); preflow_handler(desc); handle_irq_event(desc); cond_unmask_eoi_irq(desc, chip); raw_spin_unlock(&desc->lock); return; out: if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq); #endif /* CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS */ /** * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if * NULL) * @data: Pointer to interrupt specific data */ void irq_chip_enable_parent(struct irq_data *data) { data = data->parent_data; if (data->chip->irq_enable) data->chip->irq_enable(data); else data->chip->irq_unmask(data); } EXPORT_SYMBOL_GPL(irq_chip_enable_parent); /** * irq_chip_disable_parent - Disable the parent interrupt (defaults to mask if * NULL) * @data: Pointer to interrupt specific data */ void irq_chip_disable_parent(struct irq_data *data) { data = data->parent_data; if (data->chip->irq_disable) data->chip->irq_disable(data); else data->chip->irq_mask(data); } EXPORT_SYMBOL_GPL(irq_chip_disable_parent); /** * irq_chip_ack_parent - Acknowledge the parent interrupt * @data: Pointer to interrupt specific data */ void irq_chip_ack_parent(struct irq_data *data) { data = data->parent_data; data->chip->irq_ack(data); } EXPORT_SYMBOL_GPL(irq_chip_ack_parent); /** * irq_chip_mask_parent - Mask the parent interrupt * @data: Pointer to interrupt specific data */ void irq_chip_mask_parent(struct irq_data *data) { data = data->parent_data; data->chip->irq_mask(data); } EXPORT_SYMBOL_GPL(irq_chip_mask_parent); /** * irq_chip_unmask_parent - Unmask the parent interrupt * @data: Pointer to interrupt specific data */ void irq_chip_unmask_parent(struct irq_data *data) { data = data->parent_data; data->chip->irq_unmask(data); } EXPORT_SYMBOL_GPL(irq_chip_unmask_parent); /** * irq_chip_eoi_parent - Invoke EOI on the parent interrupt * @data: Pointer to interrupt specific data */ void irq_chip_eoi_parent(struct irq_data *data) { data = data->parent_data; data->chip->irq_eoi(data); } EXPORT_SYMBOL_GPL(irq_chip_eoi_parent); /** * irq_chip_set_affinity_parent - Set affinity on the parent interrupt * @data: Pointer to interrupt specific data * @dest: The affinity mask to set * @force: Flag to enforce setting (disable online checks) * * Conditinal, as the underlying parent chip might not implement it. */ int irq_chip_set_affinity_parent(struct irq_data *data, const struct cpumask *dest, bool force) { data = data->parent_data; if (data->chip->irq_set_affinity) return data->chip->irq_set_affinity(data, dest, force); return -ENOSYS; } EXPORT_SYMBOL_GPL(irq_chip_set_affinity_parent); /** * irq_chip_set_type_parent - Set IRQ type on the parent interrupt * @data: Pointer to interrupt specific data * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h * * Conditional, as the underlying parent chip might not implement it. 
 */
int irq_chip_set_type_parent(struct irq_data *data, unsigned int type)
{
	data = data->parent_data;

	if (data->chip->irq_set_type)
		return data->chip->irq_set_type(data, type);

	return -ENOSYS;
}
EXPORT_SYMBOL_GPL(irq_chip_set_type_parent);

/**
 * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
 * @data: Pointer to interrupt specific data
 *
 * Iterate through the domain hierarchy of the interrupt and check
 * whether a hw retrigger function exists. If yes, invoke it.
 */
int irq_chip_retrigger_hierarchy(struct irq_data *data)
{
	for (data = data->parent_data; data; data = data->parent_data)
		if (data->chip && data->chip->irq_retrigger)
			return data->chip->irq_retrigger(data);

	return 0;
}

/**
 * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt
 * @data: Pointer to interrupt specific data
 * @vcpu_info: The vcpu affinity information
 */
int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
{
	data = data->parent_data;
	if (data->chip->irq_set_vcpu_affinity)
		return data->chip->irq_set_vcpu_affinity(data, vcpu_info);

	return -ENOSYS;
}

/**
 * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt
 * @data: Pointer to interrupt specific data
 * @on: Whether to set or reset the wake-up capability of this irq
 *
 * Conditional, as the underlying parent chip might not implement it.
 */
int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
{
	data = data->parent_data;

	if (data->chip->flags & IRQCHIP_SKIP_SET_WAKE)
		return 0;

	if (data->chip->irq_set_wake)
		return data->chip->irq_set_wake(data, on);

	return -ENOSYS;
}
#endif

/**
 * irq_chip_compose_msi_msg - Compose an MSI message for an irq chip
 * @data: Pointer to interrupt specific data
 * @msg: Pointer to the MSI message
 *
 * For hierarchical domains we find the first chip in the hierarchy
 * which implements the irq_compose_msi_msg callback. For non-hierarchical
 * domains we use the top level chip.
 */
int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
	struct irq_data *pos = NULL;

#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
	for (; data; data = data->parent_data)
#endif
		if (data->chip && data->chip->irq_compose_msi_msg)
			pos = data;
	if (!pos)
		return -ENOSYS;

	pos->chip->irq_compose_msi_msg(pos, msg);

	return 0;
}

/**
 * irq_chip_pm_get - Enable power for an IRQ chip
 * @data: Pointer to interrupt specific data
 *
 * Enable the power to the IRQ chip referenced by the interrupt data
 * structure.
 */
int irq_chip_pm_get(struct irq_data *data)
{
	int retval;

	if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) {
		retval = pm_runtime_get_sync(data->chip->parent_device);
		if (retval < 0) {
			pm_runtime_put_noidle(data->chip->parent_device);
			return retval;
		}
	}

	return 0;
}

/**
 * irq_chip_pm_put - Disable power for an IRQ chip
 * @data: Pointer to interrupt specific data
 *
 * Disable the power to the IRQ chip referenced by the interrupt data
 * structure. Note that power will only be disabled once this function has
 * been called for all IRQs that have called irq_chip_pm_get().
 */
int irq_chip_pm_put(struct irq_data *data)
{
	int retval = 0;

	if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device)
		retval = pm_runtime_put(data->chip->parent_device);

	return (retval < 0) ? retval : 0;
}
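/*
 * Example usage (a sketch, not part of the kernel code above): how a driver
 * for a slow-bus device such as an I2C GPIO expander typically feeds
 * handle_nested_irq() from its threaded primary handler. The foo_* names and
 * foo_read_pending() are hypothetical; handle_nested_irq(), irq_find_mapping()
 * and for_each_set_bit() are the real APIs being illustrated. The child
 * interrupts are assumed to have been marked nested with
 * irq_set_nested_thread() when they were mapped.
 */
struct foo_expander {
	struct irq_domain *domain;	/* domain for the demultiplexed lines */
};

/* hypothetical: read the pending mask over the slow bus (may sleep) */
unsigned long foo_read_pending(struct foo_expander *foo);

static irqreturn_t foo_irq_thread(int irq, void *dev_id)
{
	struct foo_expander *foo = dev_id;
	unsigned long pending = foo_read_pending(foo);
	int bit;

	/* Dispatch each pending child interrupt in this thread's context */
	for_each_set_bit(bit, &pending, BITS_PER_LONG)
		handle_nested_irq(irq_find_mapping(foo->domain, bit));

	return pending ? IRQ_HANDLED : IRQ_NONE;
}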
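/*
 * Example usage (a sketch, not part of the kernel code above): how an irqchip
 * driver usually selects between the flow handlers defined earlier. The
 * domain .map() callback installs handle_level_irq() as the default, and
 * .irq_set_type() switches to handle_edge_irq() when an edge trigger is
 * requested, since the two handlers make different mask/ack assumptions.
 * foo_irq_chip and foo_hw_set_type() are hypothetical driver pieces.
 */
void foo_hw_set_type(struct irq_data *d, unsigned int type);	/* hypothetical */

static int foo_irq_set_type(struct irq_data *d, unsigned int type)
{
	if (type & IRQ_TYPE_LEVEL_MASK)
		irq_set_handler_locked(d, handle_level_irq);
	else
		irq_set_handler_locked(d, handle_edge_irq);

	foo_hw_set_type(d, type);	/* program the trigger in hardware */
	return 0;
}

static struct irq_chip foo_irq_chip = {
	.name		= "foo",
	.irq_set_type	= foo_irq_set_type,
	/* .irq_mask/.irq_unmask/.irq_ack omitted in this sketch */
};

static int foo_irq_map(struct irq_domain *d, unsigned int virq,
		       irq_hw_number_t hwirq)
{
	irq_set_chip_and_handler(virq, &foo_irq_chip, handle_level_irq);
	irq_set_chip_data(virq, d->host_data);
	return 0;
}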
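/*
 * Example usage (a sketch, not part of the kernel code above): the usual
 * consumer of irq_set_chained_handler_and_data() - a GPIO-style demultiplexer
 * whose parent interrupt never gets a regular irqaction and is handled
 * entirely in the chained flow handler. foo_gpio and its register layout are
 * hypothetical; chained_irq_enter()/chained_irq_exit() come from
 * <linux/irqchip/chained_irq.h>.
 */
struct foo_gpio {
	struct irq_domain *domain;
	void __iomem *base;
};

#define FOO_GPIO_STATUS	0x00	/* hypothetical pending-status register */

static void foo_gpio_irq_handler(struct irq_desc *desc)
{
	struct foo_gpio *foo = irq_desc_get_handler_data(desc);
	struct irq_chip *chip = irq_desc_get_chip(desc);
	unsigned long status;
	int bit;

	chained_irq_enter(chip, desc);	/* ack/mask the parent as needed */

	status = readl(foo->base + FOO_GPIO_STATUS);
	for_each_set_bit(bit, &status, 32)
		generic_handle_irq(irq_find_mapping(foo->domain, bit));

	chained_irq_exit(chip, desc);	/* eoi/unmask the parent */
}

/* registration, e.g. from probe(): */
static void foo_gpio_setup_irq(struct foo_gpio *foo, unsigned int parent_irq)
{
	irq_set_chained_handler_and_data(parent_irq, foo_gpio_irq_handler, foo);
}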
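/*
 * Example usage (a sketch, not part of the kernel code above): how the
 * irq_chip_*_parent() helpers are normally wired up. A chip that sits in a
 * hierarchical domain but adds nothing at its own level simply forwards every
 * operation to its parent. The foo_msi name is hypothetical.
 */
static struct irq_chip foo_msi_child_chip = {
	.name			= "foo-msi",
	.irq_mask		= irq_chip_mask_parent,
	.irq_unmask		= irq_chip_unmask_parent,
	.irq_eoi		= irq_chip_eoi_parent,
	.irq_set_affinity	= irq_chip_set_affinity_parent,
	.irq_set_type		= irq_chip_set_type_parent,
	.irq_retrigger		= irq_chip_retrigger_hierarchy,
};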
/*
 *		INETPEER - A storage for permanent information about peers
 *
 *  This source is covered by the GNU GPL, the same as all kernel sources.
 *
 *  Authors:	Andrey V. Savochkin <saw@msu.ru>
 */

#include <linux/cache.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/random.h>
#include <linux/timer.h>
#include <linux/time.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/net.h>
#include <linux/workqueue.h>
#include <net/ip.h>
#include <net/inetpeer.h>
#include <net/secure_seq.h>

/*
 *  Theory of operations.
 *  We keep one entry for each peer IP address. Each node contains long-living
 *  information about the peer which doesn't depend on routes.
 *
 *  Nodes are removed only when reference counter goes to 0.
 *  When that happens, the node may be removed once a sufficient amount of
 *  time has passed since its last use. The least-recently-used entry can
 *  also be removed if the pool is overloaded, i.e. if the total number of
 *  entries is greater than or equal to the threshold.
 *
 *  Node pool is organised as an RB tree.
 *  Such an implementation has been chosen not just for fun. It's a way to
 *  prevent easy and efficient DoS attacks by creating hash collisions. A huge
 *  amount of long living nodes in a single hash slot would significantly delay
 *  lookups performed with disabled BHs.
 *
 *  Serialisation issues.
 *  1.  Nodes may appear in the tree only with the pool lock held.
 *  2.  Nodes may disappear from the tree only with the pool lock held
 *      AND reference count being 0.
 *  3.  Global variable peer_total is modified under the pool lock.
 *  4.  struct inet_peer fields modification:
 *		rb_node: pool lock
 *		refcnt: atomically against modifications on other CPU;
 *		   usually under some other lock to prevent node disappearing
 *		daddr: unchangeable
 */

static struct kmem_cache *peer_cachep __ro_after_init;

void inet_peer_base_init(struct inet_peer_base *bp)
{
	bp->rb_root = RB_ROOT;
	seqlock_init(&bp->lock);
	bp->total = 0;
}
EXPORT_SYMBOL_GPL(inet_peer_base_init);

#define PEER_MAX_GC 32

/* Exported for sysctl_net_ipv4.
 */
int inet_peer_threshold __read_mostly = 65536 + 128;	/* start to throw entries more
							 * aggressively at this stage */
int inet_peer_minttl __read_mostly = 120 * HZ;		/* TTL under high load: 120 sec */
int inet_peer_maxttl __read_mostly = 10 * 60 * HZ;	/* usual time to live: 10 min */

/* Called from ip_output.c:ip_init  */
void __init inet_initpeers(void)
{
	struct sysinfo si;

	/* Use the straight interface to information about memory. */
	si_meminfo(&si);
	/* The values below were suggested by Alexey Kuznetsov
	 * <kuznet@ms2.inr.ac.ru>. I don't have any opinion about the values
	 * myself. --SAW
	 */
	if (si.totalram <= (32768*1024)/PAGE_SIZE)
		inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */
	if (si.totalram <= (16384*1024)/PAGE_SIZE)
		inet_peer_threshold >>= 1; /* about 512KB */
	if (si.totalram <= (8192*1024)/PAGE_SIZE)
		inet_peer_threshold >>= 2; /* about 128KB */

	peer_cachep = kmem_cache_create("inet_peer_cache",
			sizeof(struct inet_peer),
			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
			NULL);
}

/* Called with rcu_read_lock() or base->lock held */
static struct inet_peer *lookup(const struct inetpeer_addr *daddr,
				struct inet_peer_base *base,
				unsigned int seq,
				struct inet_peer *gc_stack[],
				unsigned int *gc_cnt,
				struct rb_node **parent_p,
				struct rb_node ***pp_p)
{
	struct rb_node **pp, *parent, *next;
	struct inet_peer *p;

	pp = &base->rb_root.rb_node;
	parent = NULL;
	while (1) {
		int cmp;

		next = rcu_dereference_raw(*pp);
		if (!next)
			break;
		parent = next;
		p = rb_entry(parent, struct inet_peer, rb_node);
		cmp = inetpeer_addr_cmp(daddr, &p->daddr);
		if (cmp == 0) {
			if (!refcount_inc_not_zero(&p->refcnt))
				break;
			return p;
		}
		if (gc_stack) {
			if (*gc_cnt < PEER_MAX_GC)
				gc_stack[(*gc_cnt)++] = p;
		} else if (unlikely(read_seqretry(&base->lock, seq))) {
			break;
		}
		if (cmp == -1)
			pp = &next->rb_left;
		else
			pp = &next->rb_right;
	}
	*parent_p = parent;
	*pp_p = pp;
	return NULL;
}

static void inetpeer_free_rcu(struct rcu_head *head)
{
	kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
}

/* perform garbage collect on all items stacked during a lookup */
static void inet_peer_gc(struct inet_peer_base *base,
			 struct inet_peer *gc_stack[],
			 unsigned int gc_cnt)
{
	int peer_threshold, peer_maxttl, peer_minttl;
	struct inet_peer *p;
	__u32 delta, ttl;
	int i;

	peer_threshold = READ_ONCE(inet_peer_threshold);
	peer_maxttl = READ_ONCE(inet_peer_maxttl);
	peer_minttl = READ_ONCE(inet_peer_minttl);

	if (base->total >= peer_threshold)
		ttl = 0; /* be aggressive */
	else
		ttl = peer_maxttl - (peer_maxttl - peer_minttl) / HZ *
			base->total / peer_threshold * HZ;
	for (i = 0; i < gc_cnt; i++) {
		p = gc_stack[i];

		/* The READ_ONCE() pairs with the WRITE_ONCE()
		 * in inet_putpeer()
		 */
		delta = (__u32)jiffies - READ_ONCE(p->dtime);

		if (delta < ttl || !refcount_dec_if_one(&p->refcnt))
			gc_stack[i] = NULL;
	}
	for (i = 0; i < gc_cnt; i++) {
		p = gc_stack[i];
		if (p) {
			rb_erase(&p->rb_node, &base->rb_root);
			base->total--;
			call_rcu(&p->rcu, inetpeer_free_rcu);
		}
	}
}

struct inet_peer *inet_getpeer(struct inet_peer_base *base,
			       const struct inetpeer_addr *daddr,
			       int create)
{
	struct inet_peer *p, *gc_stack[PEER_MAX_GC];
	struct rb_node **pp, *parent;
	unsigned int gc_cnt, seq;
	int invalidated;

	/* Attempt a lockless lookup first.
	 * Because of a concurrent writer, we might not find an existing entry.
	 */
	rcu_read_lock();
	seq = read_seqbegin(&base->lock);
	p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp);
	invalidated = read_seqretry(&base->lock, seq);
	rcu_read_unlock();

	if (p)
		return p;

	/* If no writer did a change during our lookup, we can return early. */
	if (!create && !invalidated)
		return NULL;

	/* retry an exact lookup, taking the lock first.
	 * At least, nodes should be hot in our cache.
	 */
	parent = NULL;
	write_seqlock_bh(&base->lock);

	gc_cnt = 0;
	p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp);
	if (!p && create) {
		p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
		if (p) {
			p->daddr = *daddr;
			p->dtime = (__u32)jiffies;
			refcount_set(&p->refcnt, 2);
			atomic_set(&p->rid, 0);
			p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
			p->rate_tokens = 0;
			p->n_redirects = 0;
			/* 60*HZ is arbitrary, but chosen high enough so that
			 * the first calculation of tokens is at its maximum.
			 */
			p->rate_last = jiffies - 60*HZ;

			rb_link_node(&p->rb_node, parent, pp);
			rb_insert_color(&p->rb_node, &base->rb_root);
			base->total++;
		}
	}
	if (gc_cnt)
		inet_peer_gc(base, gc_stack, gc_cnt);
	write_sequnlock_bh(&base->lock);

	return p;
}
EXPORT_SYMBOL_GPL(inet_getpeer);

void inet_putpeer(struct inet_peer *p)
{
	/* The WRITE_ONCE() pairs with itself (we run lockless)
	 * and the READ_ONCE() in inet_peer_gc()
	 */
	WRITE_ONCE(p->dtime, (__u32)jiffies);

	if (refcount_dec_and_test(&p->refcnt))
		call_rcu(&p->rcu, inetpeer_free_rcu);
}
EXPORT_SYMBOL_GPL(inet_putpeer);

/*
 *	Check transmit rate limitation for given message.
 *	The rate information is held in the inet_peer entries now.
 *	This function is generic and could be used for other purposes
 *	too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
 *
 *	Note that the same inet_peer fields are modified by functions in
 *	route.c too, but these work for packet destinations while xrlim_allow
 *	works for icmp destinations. This means the rate limiting information
 *	for one "ip object" is shared - and these ICMPs are twice limited:
 *	by source and by destination.
 *
 *	RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
 *			  SHOULD allow setting of rate limits
 *
 *	Shared between ICMPv4 and ICMPv6.
 */
#define XRLIM_BURST_FACTOR	6
bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
{
	unsigned long now, token;
	bool rc = false;

	if (!peer)
		return true;

	token = peer->rate_tokens;
	now = jiffies;
	token += now - peer->rate_last;
	peer->rate_last = now;
	if (token > XRLIM_BURST_FACTOR * timeout)
		token = XRLIM_BURST_FACTOR * timeout;
	if (token >= timeout) {
		token -= timeout;
		rc = true;
	}
	peer->rate_tokens = token;
	return rc;
}
EXPORT_SYMBOL(inet_peer_xrlim_allow);

void inetpeer_invalidate_tree(struct inet_peer_base *base)
{
	struct rb_node *p = rb_first(&base->rb_root);

	while (p) {
		struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node);

		p = rb_next(p);
		rb_erase(&peer->rb_node, &base->rb_root);
		inet_putpeer(peer);
		cond_resched();
	}

	base->total = 0;
}
EXPORT_SYMBOL(inetpeer_invalidate_tree);
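/*
 * Example usage (a sketch, not part of the kernel code above): the typical
 * caller pattern for this API - look a peer up (or create it), ask the token
 * bucket whether another ICMP may be sent, then drop the reference. It mirrors
 * what net/ipv4/icmp.c does, but foo_may_send_icmp() and the hardcoded HZ
 * timeout are purely illustrative.
 */
static bool foo_may_send_icmp(struct net *net, __be32 remote_addr)
{
	struct inetpeer_addr daddr = {};
	struct inet_peer *peer;
	bool allow;

	daddr.a4.addr = remote_addr;
	daddr.a4.vif = 0;
	daddr.family = AF_INET;

	/* create == 1: allocate the node if it is not in the tree yet */
	peer = inet_getpeer(net->ipv4.peers, &daddr, 1);
	if (!peer)
		return true;	/* no peer state, do not rate limit */

	allow = inet_peer_xrlim_allow(peer, HZ);	/* roughly one per second */
	inet_putpeer(peer);

	return allow;
}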
// SPDX-License-Identifier: GPL-2.0
/*
 * Provide a default dump_stack() function for architectures
 * which don't implement their own.
 */
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/smp.h>
#include <linux/atomic.h>
#include <linux/kexec.h>
#include <linux/utsname.h>

static char dump_stack_arch_desc_str[128];

/**
 * dump_stack_set_arch_desc - set arch-specific str to show with task dumps
 * @fmt: printf-style format string
 * @...: arguments for the format string
 *
 * The configured string will be printed right after utsname during task
 * dumps. Usually used to add arch-specific system identifiers. If an
 * arch wants to make use of such an ID string, it should initialize this
 * as soon as possible during boot.
 */
void __init dump_stack_set_arch_desc(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str),
		  fmt, args);
	va_end(args);
}

/**
 * dump_stack_print_info - print generic debug info for dump_stack()
 * @log_lvl: log level
 *
 * Arch-specific dump_stack() implementations can use this function to
 * print out the same debug information as the generic dump_stack().
 */
void dump_stack_print_info(const char *log_lvl)
{
	printk("%sCPU: %d PID: %d Comm: %.20s %s%s %s %.*s\n",
	       log_lvl, raw_smp_processor_id(), current->pid, current->comm,
	       kexec_crash_loaded() ? "Kdump: loaded " : "",
	       print_tainted(),
	       init_utsname()->release,
	       (int)strcspn(init_utsname()->version, " "),
	       init_utsname()->version);

	if (dump_stack_arch_desc_str[0] != '\0')
		printk("%sHardware name: %s\n",
		       log_lvl, dump_stack_arch_desc_str);

	print_worker_info(log_lvl, current);
}

/**
 * show_regs_print_info - print generic debug info for show_regs()
 * @log_lvl: log level
 *
 * show_regs() implementations can use this function to print out generic
 * debug information.
 */
void show_regs_print_info(const char *log_lvl)
{
	dump_stack_print_info(log_lvl);
}

static void __dump_stack(void)
{
	dump_stack_print_info(KERN_DEFAULT);
	show_stack(NULL, NULL);
}

/**
 * dump_stack - dump the current task information and its stack trace
 *
 * Architectures can override this implementation by implementing their own.
 */
#ifdef CONFIG_SMP
static atomic_t dump_lock = ATOMIC_INIT(-1);

asmlinkage __visible void dump_stack(void)
{
	unsigned long flags;
	int was_locked;
	int old;
	int cpu;

	/*
	 * Permit this cpu to perform nested stack dumps while serialising
	 * against other CPUs
	 */
retry:
	local_irq_save(flags);
	cpu = smp_processor_id();
	old = atomic_cmpxchg(&dump_lock, -1, cpu);
	if (old == -1) {
		was_locked = 0;
	} else if (old == cpu) {
		was_locked = 1;
	} else {
		local_irq_restore(flags);
		/*
		 * Wait for the lock to release before jumping to
		 * atomic_cmpxchg() in order to mitigate the thundering herd
		 * problem.
		 */
		do {
			cpu_relax();
		} while (atomic_read(&dump_lock) != -1);
		goto retry;
	}

	__dump_stack();

	if (!was_locked)
		atomic_set(&dump_lock, -1);

	local_irq_restore(flags);
}
#else
asmlinkage __visible void dump_stack(void)
{
	__dump_stack();
}
#endif
EXPORT_SYMBOL(dump_stack);
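/*
 * Example usage (a sketch, not part of the kernel code above): an architecture
 * publishes a hardware description once during early boot, and any code path
 * can later call dump_stack() for the task/CPU banner plus a backtrace. The
 * strings and the foo_* functions are placeholders, not real arch code.
 */
static void __init foo_arch_describe(void)
{
	/* Printed as "Hardware name: ..." by dump_stack_print_info() */
	dump_stack_set_arch_desc("%s %s", "ExampleVendor", "ExampleBoard");
}

static int foo_check_state(int state)
{
	if (state < 0) {
		pr_warn("foo: unexpected state %d\n", state);
		dump_stack();	/* one-shot diagnostic backtrace */
		return -EINVAL;
	}
	return 0;
}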
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _X_TABLES_