30 30 22 23 50 45 25 2 3 6 1 6 5 33 32 13 41 40 5 3 3 4 2 3 2 17 1 1 1 1 2 1 2 4 1 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 | // SPDX-License-Identifier: GPL-2.0-or-later /* * OSS compatible sequencer driver * * Timer control routines * * Copyright (C) 1998,99 Takashi Iwai <tiwai@suse.de> */ #include "seq_oss_timer.h" #include "seq_oss_event.h" #include <sound/seq_oss_legacy.h> #include <linux/slab.h> /* */ #define MIN_OSS_TEMPO 8 #define MAX_OSS_TEMPO 360 #define MIN_OSS_TIMEBASE 1 #define MAX_OSS_TIMEBASE 1000 /* */ static void calc_alsa_tempo(struct seq_oss_timer *timer); static int send_timer_event(struct seq_oss_devinfo *dp, int type, int value); /* * create and register a new timer. * if queue is not started yet, start it. */ struct seq_oss_timer * snd_seq_oss_timer_new(struct seq_oss_devinfo *dp) { struct seq_oss_timer *rec; rec = kzalloc(sizeof(*rec), GFP_KERNEL); if (rec == NULL) return NULL; rec->dp = dp; rec->cur_tick = 0; rec->realtime = 0; rec->running = 0; rec->oss_tempo = 60; rec->oss_timebase = 100; calc_alsa_tempo(rec); return rec; } /* * delete timer. * if no more timer exists, stop the queue. */ void snd_seq_oss_timer_delete(struct seq_oss_timer *rec) { if (rec) { snd_seq_oss_timer_stop(rec); kfree(rec); } } /* * process one timing event * return 1 : event proceseed -- skip this event * 0 : not a timer event -- enqueue this event */ int snd_seq_oss_process_timer_event(struct seq_oss_timer *rec, union evrec *ev) { abstime_t parm = ev->t.time; if (ev->t.code == EV_TIMING) { switch (ev->t.cmd) { case TMR_WAIT_REL: parm += rec->cur_tick; rec->realtime = 0; fallthrough; case TMR_WAIT_ABS: if (parm == 0) { rec->realtime = 1; } else if (parm >= rec->cur_tick) { rec->realtime = 0; rec->cur_tick = parm; } return 1; /* skip this event */ case TMR_START: snd_seq_oss_timer_start(rec); return 1; } } else if (ev->s.code == SEQ_WAIT) { /* time = from 1 to 3 bytes */ parm = (ev->echo >> 8) & 0xffffff; if (parm > rec->cur_tick) { /* set next event time */ rec->cur_tick = parm; rec->realtime = 0; } return 1; } return 0; } /* * convert tempo units */ static void calc_alsa_tempo(struct seq_oss_timer *timer) { timer->tempo = (60 * 1000000) / timer->oss_tempo; timer->ppq = timer->oss_timebase; } /* * dispatch a timer event */ static int send_timer_event(struct seq_oss_devinfo *dp, int type, int value) { struct snd_seq_event ev; memset(&ev, 0, sizeof(ev)); ev.type = type; ev.source.client = dp->cseq; ev.source.port = 0; ev.dest.client = SNDRV_SEQ_CLIENT_SYSTEM; ev.dest.port = SNDRV_SEQ_PORT_SYSTEM_TIMER; ev.queue = dp->queue; ev.data.queue.queue = dp->queue; ev.data.queue.param.value = value; return snd_seq_kernel_client_dispatch(dp->cseq, &ev, 1, 0); } /* * set queue tempo and start queue */ int snd_seq_oss_timer_start(struct seq_oss_timer *timer) { struct seq_oss_devinfo *dp = timer->dp; struct snd_seq_queue_tempo tmprec; if (timer->running) snd_seq_oss_timer_stop(timer); memset(&tmprec, 0, sizeof(tmprec)); tmprec.queue = dp->queue; tmprec.ppq = timer->ppq; tmprec.tempo = timer->tempo; snd_seq_set_queue_tempo(dp->cseq, &tmprec); send_timer_event(dp, SNDRV_SEQ_EVENT_START, 0); timer->running = 1; timer->cur_tick = 0; return 0; } /* * stop queue */ int snd_seq_oss_timer_stop(struct seq_oss_timer *timer) { if (! timer->running) return 0; send_timer_event(timer->dp, SNDRV_SEQ_EVENT_STOP, 0); timer->running = 0; return 0; } /* * continue queue */ int snd_seq_oss_timer_continue(struct seq_oss_timer *timer) { if (timer->running) return 0; send_timer_event(timer->dp, SNDRV_SEQ_EVENT_CONTINUE, 0); timer->running = 1; return 0; } /* * change queue tempo */ int snd_seq_oss_timer_tempo(struct seq_oss_timer *timer, int value) { if (value < MIN_OSS_TEMPO) value = MIN_OSS_TEMPO; else if (value > MAX_OSS_TEMPO) value = MAX_OSS_TEMPO; timer->oss_tempo = value; calc_alsa_tempo(timer); if (timer->running) send_timer_event(timer->dp, SNDRV_SEQ_EVENT_TEMPO, timer->tempo); return 0; } /* * ioctls */ int snd_seq_oss_timer_ioctl(struct seq_oss_timer *timer, unsigned int cmd, int __user *arg) { int value; if (cmd == SNDCTL_SEQ_CTRLRATE) { /* if *arg == 0, just return the current rate */ if (get_user(value, arg)) return -EFAULT; if (value) return -EINVAL; value = ((timer->oss_tempo * timer->oss_timebase) + 30) / 60; return put_user(value, arg) ? -EFAULT : 0; } if (timer->dp->seq_mode == SNDRV_SEQ_OSS_MODE_SYNTH) return 0; switch (cmd) { case SNDCTL_TMR_START: return snd_seq_oss_timer_start(timer); case SNDCTL_TMR_STOP: return snd_seq_oss_timer_stop(timer); case SNDCTL_TMR_CONTINUE: return snd_seq_oss_timer_continue(timer); case SNDCTL_TMR_TEMPO: if (get_user(value, arg)) return -EFAULT; return snd_seq_oss_timer_tempo(timer, value); case SNDCTL_TMR_TIMEBASE: if (get_user(value, arg)) return -EFAULT; if (value < MIN_OSS_TIMEBASE) value = MIN_OSS_TIMEBASE; else if (value > MAX_OSS_TIMEBASE) value = MAX_OSS_TIMEBASE; timer->oss_timebase = value; calc_alsa_tempo(timer); return 0; case SNDCTL_TMR_METRONOME: case SNDCTL_TMR_SELECT: case SNDCTL_TMR_SOURCE: /* not supported */ return 0; } return 0; } |
1 1 1 4 3 1 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port,ip type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Counters support added */ /* 3 Comments support added */ /* 4 Forceadd support added */ /* 5 skbinfo support added */ #define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,ip"); /* Type specific function prefix */ #define HTYPE hash_ipportip /* IPv4 variant */ /* Member elements */ struct hash_ipportip4_elem { __be32 ip; __be32 ip2; __be16 port; u8 proto; u8 padding; }; static bool hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, const struct hash_ipportip4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->ip2 == ip2->ip2 && ip1->port == ip2->port && ip1->proto == ip2->proto; } static bool hash_ipportip4_data_list(struct sk_buff *skb, const struct hash_ipportip4_elem *data) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipportip4_data_next(struct hash_ipportip4_elem *next, const struct hash_ipportip4_elem *d) { next->ip = d->ip; next->port = d->port; } /* Common functions */ #define MTYPE hash_ipportip4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_ipportip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip, ip_to = 0, p = 0, port, port_to, i = 0; bool with_ports = false; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip = ntohl(e.ip); if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) swap(ip, ip_to); } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } port_to = port = ntohs(e.port); if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); } if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++, i++) { e.ip = htonl(ip); e.port = htons(p); if (i > IPSET_MAX_RANGE) { hash_ipportip4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } } return ret; } /* IPv6 variant */ struct hash_ipportip6_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; u8 proto; u8 padding; }; /* Common functions */ static bool hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1, const struct hash_ipportip6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) && ip1->port == ip2->port && ip1->proto == ip2->proto; } static bool hash_ipportip6_data_list(struct sk_buff *skb, const struct hash_ipportip6_elem *data) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipportip6_data_next(struct hash_ipportip6_elem *next, const struct hash_ipportip6_elem *d) { next->port = d->port; } #undef MTYPE #undef HOST_MASK #define MTYPE hash_ipportip6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ipportip6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { e.port = htons(port); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } static struct ip_set_type hash_ipportip_type __read_mostly = { .name = "hash:ip,port,ip", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2, .dimension = IPSET_DIM_THREE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipportip_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ipportip_init(void) { return ip_set_type_register(&hash_ipportip_type); } static void __exit hash_ipportip_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_ipportip_type); } module_init(hash_ipportip_init); module_exit(hash_ipportip_fini); |
52 52 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 | // SPDX-License-Identifier: GPL-2.0 /* * NFS exporting and validation. * * We maintain a list of clients, each of which has a list of * exports. To export an fs to a given client, you first have * to create the client entry with NFSCTL_ADDCLIENT, which * creates a client control block and adds it to the hash * table. Then, you call NFSCTL_EXPORT for each fs. * * * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de> */ #include <linux/slab.h> #include <linux/namei.h> #include <linux/module.h> #include <linux/exportfs.h> #include <linux/sunrpc/svc_xprt.h> #include "nfsd.h" #include "nfsfh.h" #include "netns.h" #include "pnfs.h" #include "filecache.h" #include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT /* * We have two caches. * One maps client+vfsmnt+dentry to export options - the export map * The other maps client+filehandle-fragment to export options. - the expkey map * * The export options are actually stored in the first map, and the * second map contains a reference to the entry in the first map. */ #define EXPKEY_HASHBITS 8 #define EXPKEY_HASHMAX (1 << EXPKEY_HASHBITS) #define EXPKEY_HASHMASK (EXPKEY_HASHMAX -1) static void expkey_put(struct kref *ref) { struct svc_expkey *key = container_of(ref, struct svc_expkey, h.ref); if (test_bit(CACHE_VALID, &key->h.flags) && !test_bit(CACHE_NEGATIVE, &key->h.flags)) path_put(&key->ek_path); auth_domain_put(key->ek_client); kfree_rcu(key, ek_rcu); } static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) { return sunrpc_cache_pipe_upcall(cd, h); } static void expkey_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) { /* client fsidtype \xfsid */ struct svc_expkey *ek = container_of(h, struct svc_expkey, h); char type[5]; qword_add(bpp, blen, ek->ek_client->name); snprintf(type, 5, "%d", ek->ek_fsidtype); qword_add(bpp, blen, type); qword_addhex(bpp, blen, (char*)ek->ek_fsid, key_len(ek->ek_fsidtype)); (*bpp)[-1] = '\n'; } static struct svc_expkey *svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new, struct svc_expkey *old); static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *); static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client fsidtype fsid expiry [path] */ char *buf; int len; struct auth_domain *dom = NULL; int err; int fsidtype; char *ep; struct svc_expkey key; struct svc_expkey *ek = NULL; if (mesg[mlen - 1] != '\n') return -EINVAL; mesg[mlen-1] = 0; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); err = -ENOMEM; if (!buf) goto out; err = -EINVAL; if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; err = -ENOENT; dom = auth_domain_find(buf); if (!dom) goto out; dprintk("found domain %s\n", buf); err = -EINVAL; if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; fsidtype = simple_strtoul(buf, &ep, 10); if (*ep) goto out; dprintk("found fsidtype %d\n", fsidtype); if (key_len(fsidtype)==0) /* invalid type */ goto out; if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) goto out; dprintk("found fsid length %d\n", len); if (len != key_len(fsidtype)) goto out; /* OK, we seem to have a valid key */ key.h.flags = 0; err = get_expiry(&mesg, &key.h.expiry_time); if (err) goto out; key.ek_client = dom; key.ek_fsidtype = fsidtype; memcpy(key.ek_fsid, buf, len); ek = svc_expkey_lookup(cd, &key); err = -ENOMEM; if (!ek) goto out; /* now we want a pathname, or empty meaning NEGATIVE */ err = -EINVAL; len = qword_get(&mesg, buf, PAGE_SIZE); if (len < 0) goto out; dprintk("Path seems to be <%s>\n", buf); err = 0; if (len == 0) { set_bit(CACHE_NEGATIVE, &key.h.flags); ek = svc_expkey_update(cd, &key, ek); if (ek) trace_nfsd_expkey_update(ek, NULL); else err = -ENOMEM; } else { err = kern_path(buf, 0, &key.ek_path); if (err) goto out; dprintk("Found the path %s\n", buf); ek = svc_expkey_update(cd, &key, ek); if (ek) trace_nfsd_expkey_update(ek, buf); else err = -ENOMEM; path_put(&key.ek_path); } cache_flush(); out: if (ek) cache_put(&ek->h, cd); if (dom) auth_domain_put(dom); kfree(buf); return err; } static int expkey_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { struct svc_expkey *ek ; int i; if (h ==NULL) { seq_puts(m, "#domain fsidtype fsid [path]\n"); return 0; } ek = container_of(h, struct svc_expkey, h); seq_printf(m, "%s %d 0x", ek->ek_client->name, ek->ek_fsidtype); for (i=0; i < key_len(ek->ek_fsidtype)/4; i++) seq_printf(m, "%08x", ek->ek_fsid[i]); if (test_bit(CACHE_VALID, &h->flags) && !test_bit(CACHE_NEGATIVE, &h->flags)) { seq_printf(m, " "); seq_path(m, &ek->ek_path, "\\ \t\n"); } seq_printf(m, "\n"); return 0; } static inline int expkey_match (struct cache_head *a, struct cache_head *b) { struct svc_expkey *orig = container_of(a, struct svc_expkey, h); struct svc_expkey *new = container_of(b, struct svc_expkey, h); if (orig->ek_fsidtype != new->ek_fsidtype || orig->ek_client != new->ek_client || memcmp(orig->ek_fsid, new->ek_fsid, key_len(orig->ek_fsidtype)) != 0) return 0; return 1; } static inline void expkey_init(struct cache_head *cnew, struct cache_head *citem) { struct svc_expkey *new = container_of(cnew, struct svc_expkey, h); struct svc_expkey *item = container_of(citem, struct svc_expkey, h); kref_get(&item->ek_client->ref); new->ek_client = item->ek_client; new->ek_fsidtype = item->ek_fsidtype; memcpy(new->ek_fsid, item->ek_fsid, sizeof(new->ek_fsid)); } static inline void expkey_update(struct cache_head *cnew, struct cache_head *citem) { struct svc_expkey *new = container_of(cnew, struct svc_expkey, h); struct svc_expkey *item = container_of(citem, struct svc_expkey, h); new->ek_path = item->ek_path; path_get(&item->ek_path); } static struct cache_head *expkey_alloc(void) { struct svc_expkey *i = kmalloc(sizeof(*i), GFP_KERNEL); if (i) return &i->h; else return NULL; } static void expkey_flush(void) { /* * Take the nfsd_mutex here to ensure that the file cache is not * destroyed while we're in the middle of flushing. */ mutex_lock(&nfsd_mutex); nfsd_file_cache_purge(current->nsproxy->net_ns); mutex_unlock(&nfsd_mutex); } static const struct cache_detail svc_expkey_cache_template = { .owner = THIS_MODULE, .hash_size = EXPKEY_HASHMAX, .name = "nfsd.fh", .cache_put = expkey_put, .cache_upcall = expkey_upcall, .cache_request = expkey_request, .cache_parse = expkey_parse, .cache_show = expkey_show, .match = expkey_match, .init = expkey_init, .update = expkey_update, .alloc = expkey_alloc, .flush = expkey_flush, }; static int svc_expkey_hash(struct svc_expkey *item) { int hash = item->ek_fsidtype; char * cp = (char*)item->ek_fsid; int len = key_len(item->ek_fsidtype); hash ^= hash_mem(cp, len, EXPKEY_HASHBITS); hash ^= hash_ptr(item->ek_client, EXPKEY_HASHBITS); hash &= EXPKEY_HASHMASK; return hash; } static struct svc_expkey * svc_expkey_lookup(struct cache_detail *cd, struct svc_expkey *item) { struct cache_head *ch; int hash = svc_expkey_hash(item); ch = sunrpc_cache_lookup_rcu(cd, &item->h, hash); if (ch) return container_of(ch, struct svc_expkey, h); else return NULL; } static struct svc_expkey * svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new, struct svc_expkey *old) { struct cache_head *ch; int hash = svc_expkey_hash(new); ch = sunrpc_cache_update(cd, &new->h, &old->h, hash); if (ch) return container_of(ch, struct svc_expkey, h); else return NULL; } #define EXPORT_HASHBITS 8 #define EXPORT_HASHMAX (1<< EXPORT_HASHBITS) static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc) { struct nfsd4_fs_location *locations = fsloc->locations; int i; if (!locations) return; for (i = 0; i < fsloc->locations_count; i++) { kfree(locations[i].path); kfree(locations[i].hosts); } kfree(locations); fsloc->locations = NULL; } static int export_stats_init(struct export_stats *stats) { stats->start_time = ktime_get_seconds(); return percpu_counter_init_many(stats->counter, 0, GFP_KERNEL, EXP_STATS_COUNTERS_NUM); } static void export_stats_reset(struct export_stats *stats) { if (stats) { int i; for (i = 0; i < EXP_STATS_COUNTERS_NUM; i++) percpu_counter_set(&stats->counter[i], 0); } } static void export_stats_destroy(struct export_stats *stats) { if (stats) percpu_counter_destroy_many(stats->counter, EXP_STATS_COUNTERS_NUM); } static void svc_export_put(struct kref *ref) { struct svc_export *exp = container_of(ref, struct svc_export, h.ref); path_put(&exp->ex_path); auth_domain_put(exp->ex_client); nfsd4_fslocs_free(&exp->ex_fslocs); export_stats_destroy(exp->ex_stats); kfree(exp->ex_stats); kfree(exp->ex_uuid); kfree_rcu(exp, ex_rcu); } static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) { return sunrpc_cache_pipe_upcall(cd, h); } static void svc_export_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) { /* client path */ struct svc_export *exp = container_of(h, struct svc_export, h); char *pth; qword_add(bpp, blen, exp->ex_client->name); pth = d_path(&exp->ex_path, *bpp, *blen); if (IS_ERR(pth)) { /* is this correct? */ (*bpp)[0] = '\n'; return; } qword_add(bpp, blen, pth); (*bpp)[-1] = '\n'; } static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); static int check_export(struct path *path, int *flags, unsigned char *uuid) { struct inode *inode = d_inode(path->dentry); /* * We currently export only dirs, regular files, and (for v4 * pseudoroot) symlinks. */ if (!S_ISDIR(inode->i_mode) && !S_ISLNK(inode->i_mode) && !S_ISREG(inode->i_mode)) return -ENOTDIR; /* * Mountd should never pass down a writeable V4ROOT export, but, * just to make sure: */ if (*flags & NFSEXP_V4ROOT) *flags |= NFSEXP_READONLY; /* There are two requirements on a filesystem to be exportable. * 1: We must be able to identify the filesystem from a number. * either a device number (so FS_REQUIRES_DEV needed) * or an FSID number (so NFSEXP_FSID or ->uuid is needed). * 2: We must be able to find an inode from a filehandle. * This means that s_export_op must be set. * 3: We must not currently be on an idmapped mount. */ if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) && !(*flags & NFSEXP_FSID) && uuid == NULL) { dprintk("exp_export: export of non-dev fs without fsid\n"); return -EINVAL; } if (!exportfs_can_decode_fh(inode->i_sb->s_export_op)) { dprintk("exp_export: export of invalid fs type.\n"); return -EINVAL; } if (is_idmapped_mnt(path->mnt)) { dprintk("exp_export: export of idmapped mounts not yet supported.\n"); return -EINVAL; } if (inode->i_sb->s_export_op->flags & EXPORT_OP_NOSUBTREECHK && !(*flags & NFSEXP_NOSUBTREECHECK)) { dprintk("%s: %s does not support subtree checking!\n", __func__, inode->i_sb->s_type->name); return -EINVAL; } return 0; } #ifdef CONFIG_NFSD_V4 static int fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) { int len; int migrated, i, err; /* more than one fsloc */ if (fsloc->locations) return -EINVAL; /* listsize */ err = get_uint(mesg, &fsloc->locations_count); if (err) return err; if (fsloc->locations_count > MAX_FS_LOCATIONS) return -EINVAL; if (fsloc->locations_count == 0) return 0; fsloc->locations = kcalloc(fsloc->locations_count, sizeof(struct nfsd4_fs_location), GFP_KERNEL); if (!fsloc->locations) return -ENOMEM; for (i=0; i < fsloc->locations_count; i++) { /* colon separated host list */ err = -EINVAL; len = qword_get(mesg, buf, PAGE_SIZE); if (len <= 0) goto out_free_all; err = -ENOMEM; fsloc->locations[i].hosts = kstrdup(buf, GFP_KERNEL); if (!fsloc->locations[i].hosts) goto out_free_all; err = -EINVAL; /* slash separated path component list */ len = qword_get(mesg, buf, PAGE_SIZE); if (len <= 0) goto out_free_all; err = -ENOMEM; fsloc->locations[i].path = kstrdup(buf, GFP_KERNEL); if (!fsloc->locations[i].path) goto out_free_all; } /* migrated */ err = get_int(mesg, &migrated); if (err) goto out_free_all; err = -EINVAL; if (migrated < 0 || migrated > 1) goto out_free_all; fsloc->migrated = migrated; return 0; out_free_all: nfsd4_fslocs_free(fsloc); return err; } static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { struct exp_flavor_info *f; u32 listsize; int err; /* more than one secinfo */ if (exp->ex_nflavors) return -EINVAL; err = get_uint(mesg, &listsize); if (err) return err; if (listsize > MAX_SECINFO_LIST) return -EINVAL; for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) { err = get_uint(mesg, &f->pseudoflavor); if (err) return err; /* * XXX: It would be nice to also check whether this * pseudoflavor is supported, so we can discover the * problem at export time instead of when a client fails * to authenticate. */ err = get_uint(mesg, &f->flags); if (err) return err; /* Only some flags are allowed to differ between flavors: */ if (~NFSEXP_SECINFO_FLAGS & (f->flags ^ exp->ex_flags)) return -EINVAL; } exp->ex_nflavors = listsize; return 0; } #else /* CONFIG_NFSD_V4 */ static inline int fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc){return 0;} static inline int secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; } #endif static int xprtsec_parse(char **mesg, char *buf, struct svc_export *exp) { unsigned int i, mode, listsize; int err; err = get_uint(mesg, &listsize); if (err) return err; if (listsize > NFSEXP_XPRTSEC_NUM) return -EINVAL; exp->ex_xprtsec_modes = 0; for (i = 0; i < listsize; i++) { err = get_uint(mesg, &mode); if (err) return err; if (mode > NFSEXP_XPRTSEC_MTLS) return -EINVAL; exp->ex_xprtsec_modes |= mode; } return 0; } static inline int nfsd_uuid_parse(char **mesg, char *buf, unsigned char **puuid) { int len; /* more than one uuid */ if (*puuid) return -EINVAL; /* expect a 16 byte uuid encoded as \xXXXX... */ len = qword_get(mesg, buf, PAGE_SIZE); if (len != EX_UUID_LEN) return -EINVAL; *puuid = kmemdup(buf, EX_UUID_LEN, GFP_KERNEL); if (*puuid == NULL) return -ENOMEM; return 0; } static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client path expiry [flags anonuid anongid fsid] */ char *buf; int err; struct auth_domain *dom = NULL; struct svc_export exp = {}, *expp; int an_int; if (mesg[mlen-1] != '\n') return -EINVAL; mesg[mlen-1] = 0; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) return -ENOMEM; /* client */ err = -EINVAL; if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out; err = -ENOENT; dom = auth_domain_find(buf); if (!dom) goto out; /* path */ err = -EINVAL; if (qword_get(&mesg, buf, PAGE_SIZE) <= 0) goto out1; err = kern_path(buf, 0, &exp.ex_path); if (err) goto out1; exp.ex_client = dom; exp.cd = cd; exp.ex_devid_map = NULL; exp.ex_xprtsec_modes = NFSEXP_XPRTSEC_ALL; /* expiry */ err = get_expiry(&mesg, &exp.h.expiry_time); if (err) goto out3; /* flags */ err = get_int(&mesg, &an_int); if (err == -ENOENT) { err = 0; set_bit(CACHE_NEGATIVE, &exp.h.flags); } else { if (err || an_int < 0) goto out3; exp.ex_flags= an_int; /* anon uid */ err = get_int(&mesg, &an_int); if (err) goto out3; exp.ex_anon_uid= make_kuid(current_user_ns(), an_int); /* anon gid */ err = get_int(&mesg, &an_int); if (err) goto out3; exp.ex_anon_gid= make_kgid(current_user_ns(), an_int); /* fsid */ err = get_int(&mesg, &an_int); if (err) goto out3; exp.ex_fsid = an_int; while (qword_get(&mesg, buf, PAGE_SIZE) > 0) { if (strcmp(buf, "fsloc") == 0) err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); else if (strcmp(buf, "uuid") == 0) err = nfsd_uuid_parse(&mesg, buf, &exp.ex_uuid); else if (strcmp(buf, "secinfo") == 0) err = secinfo_parse(&mesg, buf, &exp); else if (strcmp(buf, "xprtsec") == 0) err = xprtsec_parse(&mesg, buf, &exp); else /* quietly ignore unknown words and anything * following. Newer user-space can try to set * new values, then see what the result was. */ break; if (err) goto out4; } err = check_export(&exp.ex_path, &exp.ex_flags, exp.ex_uuid); if (err) goto out4; /* * No point caching this if it would immediately expire. * Also, this protects exportfs's dummy export from the * anon_uid/anon_gid checks: */ if (exp.h.expiry_time < seconds_since_boot()) goto out4; /* * For some reason exportfs has been passing down an * invalid (-1) uid & gid on the "dummy" export which it * uses to test export support. To make sure exportfs * sees errors from check_export we therefore need to * delay these checks till after check_export: */ err = -EINVAL; if (!uid_valid(exp.ex_anon_uid)) goto out4; if (!gid_valid(exp.ex_anon_gid)) goto out4; err = 0; nfsd4_setup_layout_type(&exp); } expp = svc_export_lookup(&exp); if (!expp) { err = -ENOMEM; goto out4; } expp = svc_export_update(&exp, expp); if (expp) { trace_nfsd_export_update(expp); cache_flush(); exp_put(expp); } else err = -ENOMEM; out4: nfsd4_fslocs_free(&exp.ex_fslocs); kfree(exp.ex_uuid); out3: path_put(&exp.ex_path); out1: auth_domain_put(dom); out: kfree(buf); return err; } static void exp_flags(struct seq_file *m, int flag, int fsid, kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs); static void show_secinfo(struct seq_file *m, struct svc_export *exp); static int is_export_stats_file(struct seq_file *m) { /* * The export_stats file uses the same ops as the exports file. * We use the file's name to determine the reported info per export. * There is no rename in nsfdfs, so d_name.name is stable. */ return !strcmp(m->file->f_path.dentry->d_name.name, "export_stats"); } static int svc_export_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) { struct svc_export *exp; bool export_stats = is_export_stats_file(m); if (h == NULL) { if (export_stats) seq_puts(m, "#path domain start-time\n#\tstats\n"); else seq_puts(m, "#path domain(flags)\n"); return 0; } exp = container_of(h, struct svc_export, h); seq_path(m, &exp->ex_path, " \t\n\\"); seq_putc(m, '\t'); seq_escape(m, exp->ex_client->name, " \t\n\\"); if (export_stats) { struct percpu_counter *counter = exp->ex_stats->counter; seq_printf(m, "\t%lld\n", exp->ex_stats->start_time); seq_printf(m, "\tfh_stale: %lld\n", percpu_counter_sum_positive(&counter[EXP_STATS_FH_STALE])); seq_printf(m, "\tio_read: %lld\n", percpu_counter_sum_positive(&counter[EXP_STATS_IO_READ])); seq_printf(m, "\tio_write: %lld\n", percpu_counter_sum_positive(&counter[EXP_STATS_IO_WRITE])); seq_putc(m, '\n'); return 0; } seq_putc(m, '('); if (test_bit(CACHE_VALID, &h->flags) && !test_bit(CACHE_NEGATIVE, &h->flags)) { exp_flags(m, exp->ex_flags, exp->ex_fsid, exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs); if (exp->ex_uuid) { int i; seq_puts(m, ",uuid="); for (i = 0; i < EX_UUID_LEN; i++) { if ((i&3) == 0 && i) seq_putc(m, ':'); seq_printf(m, "%02x", exp->ex_uuid[i]); } } show_secinfo(m, exp); } seq_puts(m, ")\n"); return 0; } static int svc_export_match(struct cache_head *a, struct cache_head *b) { struct svc_export *orig = container_of(a, struct svc_export, h); struct svc_export *new = container_of(b, struct svc_export, h); return orig->ex_client == new->ex_client && path_equal(&orig->ex_path, &new->ex_path); } static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) { struct svc_export *new = container_of(cnew, struct svc_export, h); struct svc_export *item = container_of(citem, struct svc_export, h); kref_get(&item->ex_client->ref); new->ex_client = item->ex_client; new->ex_path = item->ex_path; path_get(&item->ex_path); new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; new->ex_layout_types = 0; new->ex_uuid = NULL; new->cd = item->cd; export_stats_reset(new->ex_stats); } static void export_update(struct cache_head *cnew, struct cache_head *citem) { struct svc_export *new = container_of(cnew, struct svc_export, h); struct svc_export *item = container_of(citem, struct svc_export, h); int i; new->ex_flags = item->ex_flags; new->ex_anon_uid = item->ex_anon_uid; new->ex_anon_gid = item->ex_anon_gid; new->ex_fsid = item->ex_fsid; new->ex_devid_map = item->ex_devid_map; item->ex_devid_map = NULL; new->ex_uuid = item->ex_uuid; item->ex_uuid = NULL; new->ex_fslocs.locations = item->ex_fslocs.locations; item->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = item->ex_fslocs.locations_count; item->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = item->ex_fslocs.migrated; item->ex_fslocs.migrated = 0; new->ex_layout_types = item->ex_layout_types; new->ex_nflavors = item->ex_nflavors; for (i = 0; i < MAX_SECINFO_LIST; i++) { new->ex_flavors[i] = item->ex_flavors[i]; } new->ex_xprtsec_modes = item->ex_xprtsec_modes; } static struct cache_head *svc_export_alloc(void) { struct svc_export *i = kmalloc(sizeof(*i), GFP_KERNEL); if (!i) return NULL; i->ex_stats = kmalloc(sizeof(*(i->ex_stats)), GFP_KERNEL); if (!i->ex_stats) { kfree(i); return NULL; } if (export_stats_init(i->ex_stats)) { kfree(i->ex_stats); kfree(i); return NULL; } return &i->h; } static const struct cache_detail svc_export_cache_template = { .owner = THIS_MODULE, .hash_size = EXPORT_HASHMAX, .name = "nfsd.export", .cache_put = svc_export_put, .cache_upcall = svc_export_upcall, .cache_request = svc_export_request, .cache_parse = svc_export_parse, .cache_show = svc_export_show, .match = svc_export_match, .init = svc_export_init, .update = export_update, .alloc = svc_export_alloc, }; static int svc_export_hash(struct svc_export *exp) { int hash; hash = hash_ptr(exp->ex_client, EXPORT_HASHBITS); hash ^= hash_ptr(exp->ex_path.dentry, EXPORT_HASHBITS); hash ^= hash_ptr(exp->ex_path.mnt, EXPORT_HASHBITS); return hash; } static struct svc_export * svc_export_lookup(struct svc_export *exp) { struct cache_head *ch; int hash = svc_export_hash(exp); ch = sunrpc_cache_lookup_rcu(exp->cd, &exp->h, hash); if (ch) return container_of(ch, struct svc_export, h); else return NULL; } static struct svc_export * svc_export_update(struct svc_export *new, struct svc_export *old) { struct cache_head *ch; int hash = svc_export_hash(old); ch = sunrpc_cache_update(old->cd, &new->h, &old->h, hash); if (ch) return container_of(ch, struct svc_export, h); else return NULL; } static struct svc_expkey * exp_find_key(struct cache_detail *cd, struct auth_domain *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) { struct svc_expkey key, *ek; int err; if (!clp) return ERR_PTR(-ENOENT); key.ek_client = clp; key.ek_fsidtype = fsid_type; memcpy(key.ek_fsid, fsidv, key_len(fsid_type)); ek = svc_expkey_lookup(cd, &key); if (ek == NULL) return ERR_PTR(-ENOMEM); err = cache_check(cd, &ek->h, reqp); if (err) { trace_nfsd_exp_find_key(&key, err); return ERR_PTR(err); } return ek; } static struct svc_export * exp_get_by_name(struct cache_detail *cd, struct auth_domain *clp, const struct path *path, struct cache_req *reqp) { struct svc_export *exp, key; int err; if (!clp) return ERR_PTR(-ENOENT); key.ex_client = clp; key.ex_path = *path; key.cd = cd; exp = svc_export_lookup(&key); if (exp == NULL) return ERR_PTR(-ENOMEM); err = cache_check(cd, &exp->h, reqp); if (err) { trace_nfsd_exp_get_by_name(&key, err); return ERR_PTR(err); } return exp; } /* * Find the export entry for a given dentry. */ static struct svc_export * exp_parent(struct cache_detail *cd, struct auth_domain *clp, struct path *path) { struct dentry *saved = dget(path->dentry); struct svc_export *exp = exp_get_by_name(cd, clp, path, NULL); while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { struct dentry *parent = dget_parent(path->dentry); dput(path->dentry); path->dentry = parent; exp = exp_get_by_name(cd, clp, path, NULL); } dput(path->dentry); path->dentry = saved; return exp; } /* * Obtain the root fh on behalf of a client. * This could be done in user space, but I feel that it adds some safety * since its harder to fool a kernel module than a user space program. */ int exp_rootfh(struct net *net, struct auth_domain *clp, char *name, struct knfsd_fh *f, int maxsize) { struct svc_export *exp; struct path path; struct inode *inode; struct svc_fh fh; int err; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct cache_detail *cd = nn->svc_export_cache; err = -EPERM; /* NB: we probably ought to check that it's NUL-terminated */ if (kern_path(name, 0, &path)) { printk("nfsd: exp_rootfh path not found %s", name); return err; } inode = d_inode(path.dentry); dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", name, path.dentry, clp->name, inode->i_sb->s_id, inode->i_ino); exp = exp_parent(cd, clp, &path); if (IS_ERR(exp)) { err = PTR_ERR(exp); goto out; } /* * fh must be initialized before calling fh_compose */ fh_init(&fh, maxsize); if (fh_compose(&fh, exp, path.dentry, NULL)) err = -EINVAL; else err = 0; memcpy(f, &fh.fh_handle, sizeof(struct knfsd_fh)); fh_put(&fh); exp_put(exp); out: path_put(&path); return err; } static struct svc_export *exp_find(struct cache_detail *cd, struct auth_domain *clp, int fsid_type, u32 *fsidv, struct cache_req *reqp) { struct svc_export *exp; struct nfsd_net *nn = net_generic(cd->net, nfsd_net_id); struct svc_expkey *ek = exp_find_key(nn->svc_expkey_cache, clp, fsid_type, fsidv, reqp); if (IS_ERR(ek)) return ERR_CAST(ek); exp = exp_get_by_name(cd, clp, &ek->ek_path, reqp); cache_put(&ek->h, nn->svc_expkey_cache); if (IS_ERR(exp)) return ERR_CAST(exp); return exp; } /** * check_nfsd_access - check if access to export is allowed. * @exp: svc_export that is being accessed. * @rqstp: svc_rqst attempting to access @exp (will be NULL for LOCALIO). * @may_bypass_gss: reduce strictness of authorization check * * Return values: * %nfs_ok if access is granted, or * %nfserr_wrongsec if access is denied */ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp, bool may_bypass_gss) { struct exp_flavor_info *f, *end = exp->ex_flavors + exp->ex_nflavors; struct svc_xprt *xprt; /* * If rqstp is NULL, this is a LOCALIO request which will only * ever use a filehandle/credential pair for which access has * been affirmed (by ACCESS or OPEN NFS requests) over the * wire. So there is no need for further checks here. */ if (!rqstp) return nfs_ok; xprt = rqstp->rq_xprt; if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_NONE) { if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags)) goto ok; } if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_TLS) { if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && !test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) goto ok; } if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_MTLS) { if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) goto ok; } goto denied; ok: /* legacy gss-only clients are always OK: */ if (exp->ex_client == rqstp->rq_gssclient) return nfs_ok; /* ip-address based client; check sec= export option: */ for (f = exp->ex_flavors; f < end; f++) { if (f->pseudoflavor == rqstp->rq_cred.cr_flavor) return nfs_ok; } /* defaults in absence of sec= options: */ if (exp->ex_nflavors == 0) { if (rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL || rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX) return nfs_ok; } /* If the compound op contains a spo_must_allowed op, * it will be sent with integrity/protection which * will have to be expressly allowed on mounts that * don't support it */ if (nfsd4_spo_must_allow(rqstp)) return nfs_ok; /* Some calls may be processed without authentication * on GSS exports. For example NFS2/3 calls on root * directory, see section 2.3.2 of rfc 2623. * For "may_bypass_gss" check that export has really * enabled some flavor with authentication (GSS or any * other) and also check that the used auth flavor is * without authentication (none or sys). */ if (may_bypass_gss && ( rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL || rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX)) { for (f = exp->ex_flavors; f < end; f++) { if (f->pseudoflavor >= RPC_AUTH_DES) return 0; } } denied: return nfserr_wrongsec; } /* * Uses rq_client and rq_gssclient to find an export; uses rq_client (an * auth_unix client) if it's available and has secinfo information; * otherwise, will try to use rq_gssclient. * * Called from functions that handle requests; functions that do work on * behalf of mountd are passed a single client name to use, and should * use exp_get_by_name() or exp_find(). */ struct svc_export * rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct cache_detail *cd = nn->svc_export_cache; if (rqstp->rq_client == NULL) goto gss; /* First try the auth_unix client: */ exp = exp_get_by_name(cd, rqstp->rq_client, path, &rqstp->rq_chandle); if (PTR_ERR(exp) == -ENOENT) goto gss; if (IS_ERR(exp)) return exp; /* If it has secinfo, assume there are no gss/... clients */ if (exp->ex_nflavors > 0) return exp; gss: /* Otherwise, try falling back on gss client */ if (rqstp->rq_gssclient == NULL) return exp; gssexp = exp_get_by_name(cd, rqstp->rq_gssclient, path, &rqstp->rq_chandle); if (PTR_ERR(gssexp) == -ENOENT) return exp; if (!IS_ERR(exp)) exp_put(exp); return gssexp; } /** * rqst_exp_find - Find an svc_export in the context of a rqst or similar * @reqp: The handle to be used to suspend the request if a cache-upcall is needed * If NULL, missing in-cache information will result in failure. * @net: The network namespace in which the request exists * @cl: default auth_domain to use for looking up the export * @gsscl: an alternate auth_domain defined using deprecated gss/krb5 format. * @fsid_type: The type of fsid to look for * @fsidv: The actual fsid to look up in the context of either client. * * Perform a lookup for @cl/@fsidv in the given @net for an export. If * none found and @gsscl specified, repeat the lookup. * * Returns an export, or an error pointer. */ struct svc_export * rqst_exp_find(struct cache_req *reqp, struct net *net, struct auth_domain *cl, struct auth_domain *gsscl, int fsid_type, u32 *fsidv) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); struct cache_detail *cd = nn->svc_export_cache; if (!cl) goto gss; /* First try the auth_unix client: */ exp = exp_find(cd, cl, fsid_type, fsidv, reqp); if (PTR_ERR(exp) == -ENOENT) goto gss; if (IS_ERR(exp)) return exp; /* If it has secinfo, assume there are no gss/... clients */ if (exp->ex_nflavors > 0) return exp; gss: /* Otherwise, try falling back on gss client */ if (!gsscl) return exp; gssexp = exp_find(cd, gsscl, fsid_type, fsidv, reqp); if (PTR_ERR(gssexp) == -ENOENT) return exp; if (!IS_ERR(exp)) exp_put(exp); return gssexp; } struct svc_export * rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) { struct dentry *saved = dget(path->dentry); struct svc_export *exp = rqst_exp_get_by_name(rqstp, path); while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { struct dentry *parent = dget_parent(path->dentry); dput(path->dentry); path->dentry = parent; exp = rqst_exp_get_by_name(rqstp, path); } dput(path->dentry); path->dentry = saved; return exp; } struct svc_export *rqst_find_fsidzero_export(struct svc_rqst *rqstp) { u32 fsidv[2]; mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); return rqst_exp_find(&rqstp->rq_chandle, SVC_NET(rqstp), rqstp->rq_client, rqstp->rq_gssclient, FSID_NUM, fsidv); } /* * Called when we need the filehandle for the root of the pseudofs, * for a given NFSv4 client. The root is defined to be the * export point with fsid==0 */ __be32 exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) { struct svc_export *exp; __be32 rv; exp = rqst_find_fsidzero_export(rqstp); if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); exp_put(exp); return rv; } static struct flags { int flag; char *name[2]; } expflags[] = { { NFSEXP_READONLY, {"ro", "rw"}}, { NFSEXP_INSECURE_PORT, {"insecure", ""}}, { NFSEXP_ROOTSQUASH, {"root_squash", "no_root_squash"}}, { NFSEXP_ALLSQUASH, {"all_squash", ""}}, { NFSEXP_ASYNC, {"async", "sync"}}, { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}}, { NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}}, { NFSEXP_NOHIDE, {"nohide", ""}}, { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, { NFSEXP_V4ROOT, {"v4root", ""}}, { NFSEXP_PNFS, {"pnfs", ""}}, { NFSEXP_SECURITY_LABEL, {"security_label", ""}}, { 0, {"", ""}} }; static void show_expflags(struct seq_file *m, int flags, int mask) { struct flags *flg; int state, first = 0; for (flg = expflags; flg->flag; flg++) { if (flg->flag & ~mask) continue; state = (flg->flag & flags) ? 0 : 1; if (*flg->name[state]) seq_printf(m, "%s%s", first++?",":"", flg->name[state]); } } static void show_secinfo_flags(struct seq_file *m, int flags) { seq_printf(m, ","); show_expflags(m, flags, NFSEXP_SECINFO_FLAGS); } static bool secinfo_flags_equal(int f, int g) { f &= NFSEXP_SECINFO_FLAGS; g &= NFSEXP_SECINFO_FLAGS; return f == g; } static int show_secinfo_run(struct seq_file *m, struct exp_flavor_info **fp, struct exp_flavor_info *end) { int flags; flags = (*fp)->flags; seq_printf(m, ",sec=%d", (*fp)->pseudoflavor); (*fp)++; while (*fp != end && secinfo_flags_equal(flags, (*fp)->flags)) { seq_printf(m, ":%d", (*fp)->pseudoflavor); (*fp)++; } return flags; } static void show_secinfo(struct seq_file *m, struct svc_export *exp) { struct exp_flavor_info *f; struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors; int flags; if (exp->ex_nflavors == 0) return; f = exp->ex_flavors; flags = show_secinfo_run(m, &f, end); if (!secinfo_flags_equal(flags, exp->ex_flags)) show_secinfo_flags(m, flags); while (f != end) { flags = show_secinfo_run(m, &f, end); show_secinfo_flags(m, flags); } } static void exp_flags(struct seq_file *m, int flag, int fsid, kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fsloc) { struct user_namespace *userns = m->file->f_cred->user_ns; show_expflags(m, flag, NFSEXP_ALLFLAGS); if (flag & NFSEXP_FSID) seq_printf(m, ",fsid=%d", fsid); if (!uid_eq(anonu, make_kuid(userns, (uid_t)-2)) && !uid_eq(anonu, make_kuid(userns, 0x10000-2))) seq_printf(m, ",anonuid=%u", from_kuid_munged(userns, anonu)); if (!gid_eq(anong, make_kgid(userns, (gid_t)-2)) && !gid_eq(anong, make_kgid(userns, 0x10000-2))) seq_printf(m, ",anongid=%u", from_kgid_munged(userns, anong)); if (fsloc && fsloc->locations_count > 0) { char *loctype = (fsloc->migrated) ? "refer" : "replicas"; int i; seq_printf(m, ",%s=", loctype); seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\"); seq_putc(m, '@'); seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\"); for (i = 1; i < fsloc->locations_count; i++) { seq_putc(m, ';'); seq_escape(m, fsloc->locations[i].path, ",;@ \t\n\\"); seq_putc(m, '@'); seq_escape(m, fsloc->locations[i].hosts, ",;@ \t\n\\"); } } } static int e_show(struct seq_file *m, void *p) { struct cache_head *cp = p; struct svc_export *exp = container_of(cp, struct svc_export, h); struct cache_detail *cd = m->private; bool export_stats = is_export_stats_file(m); if (p == SEQ_START_TOKEN) { seq_puts(m, "# Version 1.1\n"); if (export_stats) seq_puts(m, "# Path Client Start-time\n#\tStats\n"); else seq_puts(m, "# Path Client(Flags) # IPs\n"); return 0; } if (!cache_get_rcu(&exp->h)) return 0; if (cache_check(cd, &exp->h, NULL)) return 0; exp_put(exp); return svc_export_show(m, cd, cp); } const struct seq_operations nfs_exports_op = { .start = cache_seq_start_rcu, .next = cache_seq_next_rcu, .stop = cache_seq_stop_rcu, .show = e_show, }; /* * Initialize the exports module. */ int nfsd_export_init(struct net *net) { int rv; struct nfsd_net *nn = net_generic(net, nfsd_net_id); dprintk("nfsd: initializing export module (net: %x).\n", net->ns.inum); nn->svc_export_cache = cache_create_net(&svc_export_cache_template, net); if (IS_ERR(nn->svc_export_cache)) return PTR_ERR(nn->svc_export_cache); rv = cache_register_net(nn->svc_export_cache, net); if (rv) goto destroy_export_cache; nn->svc_expkey_cache = cache_create_net(&svc_expkey_cache_template, net); if (IS_ERR(nn->svc_expkey_cache)) { rv = PTR_ERR(nn->svc_expkey_cache); goto unregister_export_cache; } rv = cache_register_net(nn->svc_expkey_cache, net); if (rv) goto destroy_expkey_cache; return 0; destroy_expkey_cache: cache_destroy_net(nn->svc_expkey_cache, net); unregister_export_cache: cache_unregister_net(nn->svc_export_cache, net); destroy_export_cache: cache_destroy_net(nn->svc_export_cache, net); return rv; } /* * Flush exports table - called when last nfsd thread is killed */ void nfsd_export_flush(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); cache_purge(nn->svc_expkey_cache); cache_purge(nn->svc_export_cache); } /* * Shutdown the exports module. */ void nfsd_export_shutdown(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); dprintk("nfsd: shutting down export module (net: %x).\n", net->ns.inum); cache_unregister_net(nn->svc_expkey_cache, net); cache_unregister_net(nn->svc_export_cache, net); cache_destroy_net(nn->svc_expkey_cache, net); cache_destroy_net(nn->svc_export_cache, net); svcauth_unix_purge(net); dprintk("nfsd: export shutdown complete (net: %x).\n", net->ns.inum); } |
10 2 10 8 10 3 8 3 8 10 8 21 21 20 20 20 20 20 3 3 2 4 1 10 3 2 5 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 | // SPDX-License-Identifier: GPL-2.0-only #include <net/xdp_sock_drv.h> #include "netlink.h" #include "common.h" struct channels_req_info { struct ethnl_req_info base; }; struct channels_reply_data { struct ethnl_reply_data base; struct ethtool_channels channels; }; #define CHANNELS_REPDATA(__reply_base) \ container_of(__reply_base, struct channels_reply_data, base) const struct nla_policy ethnl_channels_get_policy[] = { [ETHTOOL_A_CHANNELS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int channels_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct channels_reply_data *data = CHANNELS_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_channels) return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; dev->ethtool_ops->get_channels(dev, &data->channels); ethnl_ops_complete(dev); return 0; } static int channels_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { return nla_total_size(sizeof(u32)) + /* _CHANNELS_RX_MAX */ nla_total_size(sizeof(u32)) + /* _CHANNELS_TX_MAX */ nla_total_size(sizeof(u32)) + /* _CHANNELS_OTHER_MAX */ nla_total_size(sizeof(u32)) + /* _CHANNELS_COMBINED_MAX */ nla_total_size(sizeof(u32)) + /* _CHANNELS_RX_COUNT */ nla_total_size(sizeof(u32)) + /* _CHANNELS_TX_COUNT */ nla_total_size(sizeof(u32)) + /* _CHANNELS_OTHER_COUNT */ nla_total_size(sizeof(u32)); /* _CHANNELS_COMBINED_COUNT */ } static int channels_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct channels_reply_data *data = CHANNELS_REPDATA(reply_base); const struct ethtool_channels *channels = &data->channels; if ((channels->max_rx && (nla_put_u32(skb, ETHTOOL_A_CHANNELS_RX_MAX, channels->max_rx) || nla_put_u32(skb, ETHTOOL_A_CHANNELS_RX_COUNT, channels->rx_count))) || (channels->max_tx && (nla_put_u32(skb, ETHTOOL_A_CHANNELS_TX_MAX, channels->max_tx) || nla_put_u32(skb, ETHTOOL_A_CHANNELS_TX_COUNT, channels->tx_count))) || (channels->max_other && (nla_put_u32(skb, ETHTOOL_A_CHANNELS_OTHER_MAX, channels->max_other) || nla_put_u32(skb, ETHTOOL_A_CHANNELS_OTHER_COUNT, channels->other_count))) || (channels->max_combined && (nla_put_u32(skb, ETHTOOL_A_CHANNELS_COMBINED_MAX, channels->max_combined) || nla_put_u32(skb, ETHTOOL_A_CHANNELS_COMBINED_COUNT, channels->combined_count)))) return -EMSGSIZE; return 0; } /* CHANNELS_SET */ const struct nla_policy ethnl_channels_set_policy[] = { [ETHTOOL_A_CHANNELS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_CHANNELS_RX_COUNT] = { .type = NLA_U32 }, [ETHTOOL_A_CHANNELS_TX_COUNT] = { .type = NLA_U32 }, [ETHTOOL_A_CHANNELS_OTHER_COUNT] = { .type = NLA_U32 }, [ETHTOOL_A_CHANNELS_COMBINED_COUNT] = { .type = NLA_U32 }, }; static int ethnl_set_channels_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; return ops->get_channels && ops->set_channels ? 1 : -EOPNOTSUPP; } static int ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info) { unsigned int from_channel, old_total, i; bool mod = false, mod_combined = false; struct net_device *dev = req_info->dev; struct ethtool_channels channels = {}; struct nlattr **tb = info->attrs; u32 err_attr; int ret; dev->ethtool_ops->get_channels(dev, &channels); old_total = channels.combined_count + max(channels.rx_count, channels.tx_count); ethnl_update_u32(&channels.rx_count, tb[ETHTOOL_A_CHANNELS_RX_COUNT], &mod); ethnl_update_u32(&channels.tx_count, tb[ETHTOOL_A_CHANNELS_TX_COUNT], &mod); ethnl_update_u32(&channels.other_count, tb[ETHTOOL_A_CHANNELS_OTHER_COUNT], &mod); ethnl_update_u32(&channels.combined_count, tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT], &mod_combined); mod |= mod_combined; if (!mod) return 0; /* ensure new channel counts are within limits */ if (channels.rx_count > channels.max_rx) err_attr = ETHTOOL_A_CHANNELS_RX_COUNT; else if (channels.tx_count > channels.max_tx) err_attr = ETHTOOL_A_CHANNELS_TX_COUNT; else if (channels.other_count > channels.max_other) err_attr = ETHTOOL_A_CHANNELS_OTHER_COUNT; else if (channels.combined_count > channels.max_combined) err_attr = ETHTOOL_A_CHANNELS_COMBINED_COUNT; else err_attr = 0; if (err_attr) { NL_SET_ERR_MSG_ATTR(info->extack, tb[err_attr], "requested channel count exceeds maximum"); return -EINVAL; } /* ensure there is at least one RX and one TX channel */ if (!channels.combined_count && !channels.rx_count) err_attr = ETHTOOL_A_CHANNELS_RX_COUNT; else if (!channels.combined_count && !channels.tx_count) err_attr = ETHTOOL_A_CHANNELS_TX_COUNT; else err_attr = 0; if (err_attr) { if (mod_combined) err_attr = ETHTOOL_A_CHANNELS_COMBINED_COUNT; NL_SET_ERR_MSG_ATTR(info->extack, tb[err_attr], "requested channel counts would result in no RX or TX channel being configured"); return -EINVAL; } ret = ethtool_check_max_channel(dev, channels, info); if (ret) return ret; /* Disabling channels, query zero-copy AF_XDP sockets */ from_channel = channels.combined_count + min(channels.rx_count, channels.tx_count); for (i = from_channel; i < old_total; i++) if (xsk_get_pool_from_qid(dev, i)) { GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets"); return -EINVAL; } ret = dev->ethtool_ops->set_channels(dev, &channels); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_channels_request_ops = { .request_cmd = ETHTOOL_MSG_CHANNELS_GET, .reply_cmd = ETHTOOL_MSG_CHANNELS_GET_REPLY, .hdr_attr = ETHTOOL_A_CHANNELS_HEADER, .req_info_size = sizeof(struct channels_req_info), .reply_data_size = sizeof(struct channels_reply_data), .prepare_data = channels_prepare_data, .reply_size = channels_reply_size, .fill_reply = channels_fill_reply, .set_validate = ethnl_set_channels_validate, .set = ethnl_set_channels, .set_ntf_cmd = ETHTOOL_MSG_CHANNELS_NTF, }; |
4 15 2 1 1 10 1 10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 | // SPDX-License-Identifier: GPL-2.0-only /* * Xtables module to match the process control group. * * Might be used to implement individual "per-application" firewall * policies in contrast to global policies based on control groups. * Matching is based upon processes tagged to net_cls' classid marker. * * (C) 2013 Daniel Borkmann <dborkman@redhat.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/skbuff.h> #include <linux/module.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_cgroup.h> #include <net/sock.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); MODULE_DESCRIPTION("Xtables: process control group matching"); MODULE_ALIAS("ipt_cgroup"); MODULE_ALIAS("ip6t_cgroup"); static int cgroup_mt_check_v0(const struct xt_mtchk_param *par) { struct xt_cgroup_info_v0 *info = par->matchinfo; if (info->invert & ~1) return -EINVAL; return 0; } static int cgroup_mt_check_v1(const struct xt_mtchk_param *par) { struct xt_cgroup_info_v1 *info = par->matchinfo; struct cgroup *cgrp; if ((info->invert_path & ~1) || (info->invert_classid & ~1)) return -EINVAL; if (!info->has_path && !info->has_classid) { pr_info("xt_cgroup: no path or classid specified\n"); return -EINVAL; } if (info->has_path && info->has_classid) { pr_info_ratelimited("path and classid specified\n"); return -EINVAL; } info->priv = NULL; if (info->has_path) { cgrp = cgroup_get_from_path(info->path); if (IS_ERR(cgrp)) { pr_info_ratelimited("invalid path, errno=%ld\n", PTR_ERR(cgrp)); return -EINVAL; } info->priv = cgrp; } return 0; } static int cgroup_mt_check_v2(const struct xt_mtchk_param *par) { struct xt_cgroup_info_v2 *info = par->matchinfo; struct cgroup *cgrp; if ((info->invert_path & ~1) || (info->invert_classid & ~1)) return -EINVAL; if (!info->has_path && !info->has_classid) { pr_info("xt_cgroup: no path or classid specified\n"); return -EINVAL; } if (info->has_path && info->has_classid) { pr_info_ratelimited("path and classid specified\n"); return -EINVAL; } info->priv = NULL; if (info->has_path) { cgrp = cgroup_get_from_path(info->path); if (IS_ERR(cgrp)) { pr_info_ratelimited("invalid path, errno=%ld\n", PTR_ERR(cgrp)); return -EINVAL; } info->priv = cgrp; } return 0; } static bool cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_cgroup_info_v0 *info = par->matchinfo; struct sock *sk = skb->sk; if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk))) return false; return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^ info->invert; } static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_cgroup_info_v1 *info = par->matchinfo; struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data; struct cgroup *ancestor = info->priv; struct sock *sk = skb->sk; if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk))) return false; if (ancestor) return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ info->invert_path; else return (info->classid == sock_cgroup_classid(skcd)) ^ info->invert_classid; } static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_cgroup_info_v2 *info = par->matchinfo; struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data; struct cgroup *ancestor = info->priv; struct sock *sk = skb->sk; if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk))) return false; if (ancestor) return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ info->invert_path; else return (info->classid == sock_cgroup_classid(skcd)) ^ info->invert_classid; } static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par) { struct xt_cgroup_info_v1 *info = par->matchinfo; if (info->priv) cgroup_put(info->priv); } static void cgroup_mt_destroy_v2(const struct xt_mtdtor_param *par) { struct xt_cgroup_info_v2 *info = par->matchinfo; if (info->priv) cgroup_put(info->priv); } static struct xt_match cgroup_mt_reg[] __read_mostly = { { .name = "cgroup", .revision = 0, .family = NFPROTO_UNSPEC, .checkentry = cgroup_mt_check_v0, .match = cgroup_mt_v0, .matchsize = sizeof(struct xt_cgroup_info_v0), .me = THIS_MODULE, .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), }, { .name = "cgroup", .revision = 1, .family = NFPROTO_UNSPEC, .checkentry = cgroup_mt_check_v1, .match = cgroup_mt_v1, .matchsize = sizeof(struct xt_cgroup_info_v1), .usersize = offsetof(struct xt_cgroup_info_v1, priv), .destroy = cgroup_mt_destroy_v1, .me = THIS_MODULE, .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), }, { .name = "cgroup", .revision = 2, .family = NFPROTO_UNSPEC, .checkentry = cgroup_mt_check_v2, .match = cgroup_mt_v2, .matchsize = sizeof(struct xt_cgroup_info_v2), .usersize = offsetof(struct xt_cgroup_info_v2, priv), .destroy = cgroup_mt_destroy_v2, .me = THIS_MODULE, .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), }, }; static int __init cgroup_mt_init(void) { return xt_register_matches(cgroup_mt_reg, ARRAY_SIZE(cgroup_mt_reg)); } static void __exit cgroup_mt_exit(void) { xt_unregister_matches(cgroup_mt_reg, ARRAY_SIZE(cgroup_mt_reg)); } module_init(cgroup_mt_init); module_exit(cgroup_mt_exit); |
1 1 7 9 8 42 42 3 3 20 19 7 2 2 11 3 1 10 7 2 1 2 1 5 4 4 2 2 2 2 17 17 7 10 13 4 2 15 11 4 15 17 2 2 1 1 1 1 1 1 1 1 1 1 1 5 1 18 19 1 18 18 18 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) International Business Machines Corp., 2000-2004 * Portions Copyright (C) Christoph Hellwig, 2001-2002 */ #include <linux/fs.h> #include <linux/module.h> #include <linux/completion.h> #include <linux/vfs.h> #include <linux/quotaops.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/moduleparam.h> #include <linux/kthread.h> #include <linux/posix_acl.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/crc32.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/seq_file.h> #include <linux/blkdev.h> #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_inode.h" #include "jfs_metapage.h" #include "jfs_superblock.h" #include "jfs_dmap.h" #include "jfs_imap.h" #include "jfs_acl.h" #include "jfs_debug.h" #include "jfs_xattr.h" #include "jfs_dinode.h" MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); MODULE_LICENSE("GPL"); static struct kmem_cache *jfs_inode_cachep; static const struct super_operations jfs_super_operations; static const struct export_operations jfs_export_operations; static struct file_system_type jfs_fs_type; #define MAX_COMMIT_THREADS 64 static int commit_threads; module_param(commit_threads, int, 0); MODULE_PARM_DESC(commit_threads, "Number of commit threads"); static struct task_struct *jfsCommitThread[MAX_COMMIT_THREADS]; struct task_struct *jfsIOthread; struct task_struct *jfsSyncThread; #ifdef CONFIG_JFS_DEBUG int jfsloglevel = JFS_LOGLEVEL_WARN; module_param(jfsloglevel, int, 0644); MODULE_PARM_DESC(jfsloglevel, "Specify JFS loglevel (0, 1 or 2)"); #endif static void jfs_handle_error(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); if (sb_rdonly(sb)) return; updateSuper(sb, FM_DIRTY); if (sbi->flag & JFS_ERR_PANIC) panic("JFS (device %s): panic forced after error\n", sb->s_id); else if (sbi->flag & JFS_ERR_REMOUNT_RO) { jfs_err("ERROR: (device %s): remounting filesystem as read-only", sb->s_id); sb->s_flags |= SB_RDONLY; } /* nothing is done for continue beyond marking the superblock dirty */ } void jfs_error(struct super_block *sb, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_err("ERROR: (device %s): %ps: %pV\n", sb->s_id, __builtin_return_address(0), &vaf); va_end(args); jfs_handle_error(sb); } static struct inode *jfs_alloc_inode(struct super_block *sb) { struct jfs_inode_info *jfs_inode; jfs_inode = alloc_inode_sb(sb, jfs_inode_cachep, GFP_NOFS); if (!jfs_inode) return NULL; #ifdef CONFIG_QUOTA memset(&jfs_inode->i_dquot, 0, sizeof(jfs_inode->i_dquot)); #endif return &jfs_inode->vfs_inode; } static void jfs_free_inode(struct inode *inode) { kmem_cache_free(jfs_inode_cachep, JFS_IP(inode)); } static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); s64 maxinodes; struct inomap *imap = JFS_IP(sbi->ipimap)->i_imap; jfs_info("In jfs_statfs"); buf->f_type = JFS_SUPER_MAGIC; buf->f_bsize = sbi->bsize; buf->f_blocks = sbi->bmap->db_mapsize; buf->f_bfree = sbi->bmap->db_nfree; buf->f_bavail = sbi->bmap->db_nfree; /* * If we really return the number of allocated & free inodes, some * applications will fail because they won't see enough free inodes. * We'll try to calculate some guess as to how many inodes we can * really allocate * * buf->f_files = atomic_read(&imap->im_numinos); * buf->f_ffree = atomic_read(&imap->im_numfree); */ maxinodes = min((s64) atomic_read(&imap->im_numinos) + ((sbi->bmap->db_nfree >> imap->im_l2nbperiext) << L2INOSPEREXT), (s64) 0xffffffffLL); buf->f_files = maxinodes; buf->f_ffree = maxinodes - (atomic_read(&imap->im_numinos) - atomic_read(&imap->im_numfree)); buf->f_fsid.val[0] = crc32_le(0, (char *)&sbi->uuid, sizeof(sbi->uuid)/2); buf->f_fsid.val[1] = crc32_le(0, (char *)&sbi->uuid + sizeof(sbi->uuid)/2, sizeof(sbi->uuid)/2); buf->f_namelen = JFS_NAME_MAX; return 0; } #ifdef CONFIG_QUOTA static int jfs_quota_off(struct super_block *sb, int type); static int jfs_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); static void jfs_quota_off_umount(struct super_block *sb) { int type; for (type = 0; type < MAXQUOTAS; type++) jfs_quota_off(sb, type); } static const struct quotactl_ops jfs_quotactl_ops = { .quota_on = jfs_quota_on, .quota_off = jfs_quota_off, .quota_sync = dquot_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk, .get_nextdqblk = dquot_get_next_dqblk, }; #else static inline void jfs_quota_off_umount(struct super_block *sb) { } #endif static void jfs_put_super(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); int rc; jfs_info("In jfs_put_super"); jfs_quota_off_umount(sb); rc = jfs_umount(sb); if (rc) jfs_err("jfs_umount failed with return code %d", rc); unload_nls(sbi->nls_tab); truncate_inode_pages(sbi->direct_inode->i_mapping, 0); iput(sbi->direct_inode); kfree(sbi); } enum { Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize, Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err, Opt_quota, Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask, Opt_discard, Opt_nodiscard, Opt_discard_minblk }; static const struct constant_table jfs_param_errors[] = { {"continue", JFS_ERR_CONTINUE}, {"remount-ro", JFS_ERR_REMOUNT_RO}, {"panic", JFS_ERR_PANIC}, {} }; static const struct fs_parameter_spec jfs_param_spec[] = { fsparam_flag_no ("integrity", Opt_integrity), fsparam_string ("iocharset", Opt_iocharset), fsparam_u64 ("resize", Opt_resize), fsparam_flag ("resize", Opt_resize_nosize), fsparam_enum ("errors", Opt_errors, jfs_param_errors), fsparam_flag ("quota", Opt_quota), fsparam_flag ("noquota", Opt_ignore), fsparam_flag ("usrquota", Opt_usrquota), fsparam_flag ("grpquota", Opt_grpquota), fsparam_uid ("uid", Opt_uid), fsparam_gid ("gid", Opt_gid), fsparam_u32oct ("umask", Opt_umask), fsparam_flag ("discard", Opt_discard), fsparam_u32 ("discard", Opt_discard_minblk), fsparam_flag ("nodiscard", Opt_nodiscard), {} }; struct jfs_context { int flag; kuid_t uid; kgid_t gid; uint umask; uint minblks_trim; void *nls_map; bool resize; s64 newLVSize; }; static int jfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct jfs_context *ctx = fc->fs_private; int reconfigure = (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE); struct fs_parse_result result; struct nls_table *nls_map; int opt; opt = fs_parse(fc, jfs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_integrity: if (result.negated) ctx->flag |= JFS_NOINTEGRITY; else ctx->flag &= ~JFS_NOINTEGRITY; break; case Opt_ignore: /* Silently ignore the quota options */ /* Don't do anything ;-) */ break; case Opt_iocharset: if (ctx->nls_map && ctx->nls_map != (void *) -1) { unload_nls(ctx->nls_map); ctx->nls_map = NULL; } if (!strcmp(param->string, "none")) ctx->nls_map = NULL; else { nls_map = load_nls(param->string); if (!nls_map) { pr_err("JFS: charset not found\n"); return -EINVAL; } ctx->nls_map = nls_map; } break; case Opt_resize: if (!reconfigure) return -EINVAL; ctx->resize = true; ctx->newLVSize = result.uint_64; break; case Opt_resize_nosize: if (!reconfigure) return -EINVAL; ctx->resize = true; break; case Opt_errors: ctx->flag &= ~JFS_ERR_MASK; ctx->flag |= result.uint_32; break; #ifdef CONFIG_QUOTA case Opt_quota: case Opt_usrquota: ctx->flag |= JFS_USRQUOTA; break; case Opt_grpquota: ctx->flag |= JFS_GRPQUOTA; break; #else case Opt_usrquota: case Opt_grpquota: case Opt_quota: pr_err("JFS: quota operations not supported\n"); break; #endif case Opt_uid: ctx->uid = result.uid; break; case Opt_gid: ctx->gid = result.gid; break; case Opt_umask: if (result.uint_32 & ~0777) { pr_err("JFS: Invalid value of umask\n"); return -EINVAL; } ctx->umask = result.uint_32; break; case Opt_discard: /* if set to 1, even copying files will cause * trimming :O * -> user has more control over the online trimming */ ctx->minblks_trim = 64; ctx->flag |= JFS_DISCARD; break; case Opt_nodiscard: ctx->flag &= ~JFS_DISCARD; break; case Opt_discard_minblk: ctx->minblks_trim = result.uint_32; ctx->flag |= JFS_DISCARD; break; default: return -EINVAL; } return 0; } static int jfs_reconfigure(struct fs_context *fc) { struct jfs_context *ctx = fc->fs_private; struct super_block *sb = fc->root->d_sb; int readonly = fc->sb_flags & SB_RDONLY; int rc = 0; int flag = ctx->flag; int ret; sync_filesystem(sb); /* Transfer results of parsing to the sbi */ JFS_SBI(sb)->flag = ctx->flag; JFS_SBI(sb)->uid = ctx->uid; JFS_SBI(sb)->gid = ctx->gid; JFS_SBI(sb)->umask = ctx->umask; JFS_SBI(sb)->minblks_trim = ctx->minblks_trim; if (ctx->nls_map != (void *) -1) { unload_nls(JFS_SBI(sb)->nls_tab); JFS_SBI(sb)->nls_tab = ctx->nls_map; } ctx->nls_map = NULL; if (ctx->resize) { if (sb_rdonly(sb)) { pr_err("JFS: resize requires volume to be mounted read-write\n"); return -EROFS; } if (!ctx->newLVSize) { ctx->newLVSize = sb_bdev_nr_blocks(sb); if (ctx->newLVSize == 0) pr_err("JFS: Cannot determine volume size\n"); } rc = jfs_extendfs(sb, ctx->newLVSize, 0); if (rc) return rc; } if (sb_rdonly(sb) && !readonly) { /* * Invalidate any previously read metadata. fsck may have * changed the on-disk data since we mounted r/o */ truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0); JFS_SBI(sb)->flag = flag; ret = jfs_mount_rw(sb, 1); /* mark the fs r/w for quota activity */ sb->s_flags &= ~SB_RDONLY; dquot_resume(sb, -1); return ret; } if (!sb_rdonly(sb) && readonly) { rc = dquot_suspend(sb, -1); if (rc < 0) return rc; rc = jfs_umount_rw(sb); JFS_SBI(sb)->flag = flag; return rc; } if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) { if (!sb_rdonly(sb)) { rc = jfs_umount_rw(sb); if (rc) return rc; JFS_SBI(sb)->flag = flag; ret = jfs_mount_rw(sb, 1); return ret; } } JFS_SBI(sb)->flag = flag; return 0; } static int jfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct jfs_context *ctx = fc->fs_private; int silent = fc->sb_flags & SB_SILENT; struct jfs_sb_info *sbi; struct inode *inode; int rc; int ret = -EINVAL; jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags); sbi = kzalloc(sizeof(struct jfs_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; sb->s_max_links = JFS_LINK_MAX; sb->s_time_min = 0; sb->s_time_max = U32_MAX; sbi->sb = sb; /* Transfer results of parsing to the sbi */ sbi->flag = ctx->flag; sbi->uid = ctx->uid; sbi->gid = ctx->gid; sbi->umask = ctx->umask; if (ctx->nls_map != (void *) -1) { unload_nls(sbi->nls_tab); sbi->nls_tab = ctx->nls_map; } ctx->nls_map = NULL; if (sbi->flag & JFS_DISCARD) { if (!bdev_max_discard_sectors(sb->s_bdev)) { pr_err("JFS: discard option not supported on device\n"); sbi->flag &= ~JFS_DISCARD; } else { sbi->minblks_trim = ctx->minblks_trim; } } #ifdef CONFIG_JFS_POSIX_ACL sb->s_flags |= SB_POSIXACL; #endif if (ctx->resize) { pr_err("resize option for remount only\n"); goto out_unload; } /* * Initialize blocksize to 4K. */ sb_set_blocksize(sb, PSIZE); /* * Set method vectors. */ sb->s_op = &jfs_super_operations; sb->s_export_op = &jfs_export_operations; sb->s_xattr = jfs_xattr_handlers; #ifdef CONFIG_QUOTA sb->dq_op = &dquot_operations; sb->s_qcop = &jfs_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; #endif /* * Initialize direct-mapping inode/address-space */ inode = new_inode(sb); if (inode == NULL) { ret = -ENOMEM; goto out_unload; } inode->i_size = bdev_nr_bytes(sb->s_bdev); inode->i_mapping->a_ops = &jfs_metapage_aops; inode_fake_hash(inode); mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); sbi->direct_inode = inode; rc = jfs_mount(sb); if (rc) { if (!silent) jfs_err("jfs_mount failed w/return code = %d", rc); goto out_mount_failed; } if (sb_rdonly(sb)) sbi->log = NULL; else { rc = jfs_mount_rw(sb, 0); if (rc) { if (!silent) { jfs_err("jfs_mount_rw failed, return code = %d", rc); } goto out_no_rw; } } sb->s_magic = JFS_SUPER_MAGIC; if (sbi->mntflag & JFS_OS2) sb->s_d_op = &jfs_ci_dentry_operations; inode = jfs_iget(sb, ROOT_I); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto out_no_rw; } sb->s_root = d_make_root(inode); if (!sb->s_root) goto out_no_root; /* logical blocks are represented by 40 bits in pxd_t, etc. * and page cache is indexed by long */ sb->s_maxbytes = min(((loff_t)sb->s_blocksize) << 40, MAX_LFS_FILESIZE); sb->s_time_gran = 1; return 0; out_no_root: jfs_err("jfs_read_super: get root dentry failed"); out_no_rw: rc = jfs_umount(sb); if (rc) jfs_err("jfs_umount failed with return code %d", rc); out_mount_failed: filemap_write_and_wait(sbi->direct_inode->i_mapping); truncate_inode_pages(sbi->direct_inode->i_mapping, 0); make_bad_inode(sbi->direct_inode); iput(sbi->direct_inode); sbi->direct_inode = NULL; out_unload: unload_nls(sbi->nls_tab); kfree(sbi); return ret; } static int jfs_freeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; int rc = 0; if (!sb_rdonly(sb)) { txQuiesce(sb); rc = lmLogShutdown(log); if (rc) { jfs_error(sb, "lmLogShutdown failed\n"); /* let operations fail rather than hang */ txResume(sb); return rc; } rc = updateSuper(sb, FM_CLEAN); if (rc) { jfs_err("jfs_freeze: updateSuper failed"); /* * Don't fail here. Everything succeeded except * marking the superblock clean, so there's really * no harm in leaving it frozen for now. */ } } return 0; } static int jfs_unfreeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; int rc = 0; if (!sb_rdonly(sb)) { rc = updateSuper(sb, FM_MOUNT); if (rc) { jfs_error(sb, "updateSuper failed\n"); goto out; } rc = lmLogInit(log); if (rc) jfs_error(sb, "lmLogInit failed\n"); out: txResume(sb); } return rc; } static int jfs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, jfs_fill_super); } static int jfs_sync_fs(struct super_block *sb, int wait) { struct jfs_log *log = JFS_SBI(sb)->log; /* log == NULL indicates read-only mount */ if (log) { /* * Write quota structures to quota file, sync_blockdev() will * write them to disk later */ dquot_writeback_dquots(sb, -1); jfs_flush_journal(log, wait); jfs_syncpt(log, 0); } return 0; } static int jfs_show_options(struct seq_file *seq, struct dentry *root) { struct jfs_sb_info *sbi = JFS_SBI(root->d_sb); if (uid_valid(sbi->uid)) seq_printf(seq, ",uid=%d", from_kuid(&init_user_ns, sbi->uid)); if (gid_valid(sbi->gid)) seq_printf(seq, ",gid=%d", from_kgid(&init_user_ns, sbi->gid)); if (sbi->umask != -1) seq_printf(seq, ",umask=%03o", sbi->umask); if (sbi->flag & JFS_NOINTEGRITY) seq_puts(seq, ",nointegrity"); if (sbi->flag & JFS_DISCARD) seq_printf(seq, ",discard=%u", sbi->minblks_trim); if (sbi->nls_tab) seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset); if (sbi->flag & JFS_ERR_CONTINUE) seq_printf(seq, ",errors=continue"); if (sbi->flag & JFS_ERR_PANIC) seq_printf(seq, ",errors=panic"); #ifdef CONFIG_QUOTA if (sbi->flag & JFS_USRQUOTA) seq_puts(seq, ",usrquota"); if (sbi->flag & JFS_GRPQUOTA) seq_puts(seq, ",grpquota"); #endif return 0; } #ifdef CONFIG_QUOTA /* Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code * itself serializes the operations (and no one else should touch the files) * we don't have to be afraid of races */ static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; sector_t blk = off >> sb->s_blocksize_bits; int err = 0; int offset = off & (sb->s_blocksize - 1); int tocopy; size_t toread; struct buffer_head tmp_bh; struct buffer_head *bh; loff_t i_size = i_size_read(inode); if (off > i_size) return 0; if (off+len > i_size) len = i_size-off; toread = len; while (toread > 0) { tocopy = min_t(size_t, sb->s_blocksize - offset, toread); tmp_bh.b_state = 0; tmp_bh.b_size = i_blocksize(inode); err = jfs_get_block(inode, blk, &tmp_bh, 0); if (err) return err; if (!buffer_mapped(&tmp_bh)) /* A hole? */ memset(data, 0, tocopy); else { bh = sb_bread(sb, tmp_bh.b_blocknr); if (!bh) return -EIO; memcpy(data, bh->b_data+offset, tocopy); brelse(bh); } offset = 0; toread -= tocopy; data += tocopy; blk++; } return len; } /* Write to quotafile */ static ssize_t jfs_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; sector_t blk = off >> sb->s_blocksize_bits; int err = 0; int offset = off & (sb->s_blocksize - 1); int tocopy; size_t towrite = len; struct buffer_head tmp_bh; struct buffer_head *bh; inode_lock(inode); while (towrite > 0) { tocopy = min_t(size_t, sb->s_blocksize - offset, towrite); tmp_bh.b_state = 0; tmp_bh.b_size = i_blocksize(inode); err = jfs_get_block(inode, blk, &tmp_bh, 1); if (err) goto out; if (offset || tocopy != sb->s_blocksize) bh = sb_bread(sb, tmp_bh.b_blocknr); else bh = sb_getblk(sb, tmp_bh.b_blocknr); if (!bh) { err = -EIO; goto out; } lock_buffer(bh); memcpy(bh->b_data+offset, data, tocopy); flush_dcache_page(bh->b_page); set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); brelse(bh); offset = 0; towrite -= tocopy; data += tocopy; blk++; } out: if (len == towrite) { inode_unlock(inode); return err; } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); inode_unlock(inode); return len - towrite; } static struct dquot __rcu **jfs_get_dquots(struct inode *inode) { return JFS_IP(inode)->i_dquot; } static int jfs_quota_on(struct super_block *sb, int type, int format_id, const struct path *path) { int err; struct inode *inode; err = dquot_quota_on(sb, type, format_id, path); if (err) return err; inode = d_inode(path->dentry); inode_lock(inode); JFS_IP(inode)->mode2 |= JFS_NOATIME_FL | JFS_IMMUTABLE_FL; inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, S_NOATIME | S_IMMUTABLE); inode_unlock(inode); mark_inode_dirty(inode); return 0; } static int jfs_quota_off(struct super_block *sb, int type) { struct inode *inode = sb_dqopt(sb)->files[type]; int err; if (!inode || !igrab(inode)) goto out; err = dquot_quota_off(sb, type); if (err) goto out_put; inode_lock(inode); JFS_IP(inode)->mode2 &= ~(JFS_NOATIME_FL | JFS_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode_unlock(inode); mark_inode_dirty(inode); out_put: iput(inode); return err; out: return dquot_quota_off(sb, type); } #endif static const struct super_operations jfs_super_operations = { .alloc_inode = jfs_alloc_inode, .free_inode = jfs_free_inode, .dirty_inode = jfs_dirty_inode, .write_inode = jfs_write_inode, .evict_inode = jfs_evict_inode, .put_super = jfs_put_super, .sync_fs = jfs_sync_fs, .freeze_fs = jfs_freeze, .unfreeze_fs = jfs_unfreeze, .statfs = jfs_statfs, .show_options = jfs_show_options, #ifdef CONFIG_QUOTA .quota_read = jfs_quota_read, .quota_write = jfs_quota_write, .get_dquots = jfs_get_dquots, #endif }; static const struct export_operations jfs_export_operations = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = jfs_fh_to_dentry, .fh_to_parent = jfs_fh_to_parent, .get_parent = jfs_get_parent, }; static void jfs_init_options(struct fs_context *fc, struct jfs_context *ctx) { if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { struct super_block *sb = fc->root->d_sb; /* Copy over current option values and mount flags */ ctx->uid = JFS_SBI(sb)->uid; ctx->gid = JFS_SBI(sb)->gid; ctx->umask = JFS_SBI(sb)->umask; ctx->nls_map = (void *)-1; ctx->minblks_trim = JFS_SBI(sb)->minblks_trim; ctx->flag = JFS_SBI(sb)->flag; } else { /* * Initialize the mount flag and determine the default * error handler */ ctx->flag = JFS_ERR_REMOUNT_RO; ctx->uid = INVALID_UID; ctx->gid = INVALID_GID; ctx->umask = -1; ctx->nls_map = (void *)-1; } } static void jfs_free_fc(struct fs_context *fc) { struct jfs_context *ctx = fc->fs_private; if (ctx->nls_map != (void *) -1) unload_nls(ctx->nls_map); kfree(ctx); } static const struct fs_context_operations jfs_context_ops = { .parse_param = jfs_parse_param, .get_tree = jfs_get_tree, .reconfigure = jfs_reconfigure, .free = jfs_free_fc, }; static int jfs_init_fs_context(struct fs_context *fc) { struct jfs_context *ctx; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; jfs_init_options(fc, ctx); fc->fs_private = ctx; fc->ops = &jfs_context_ops; return 0; } static struct file_system_type jfs_fs_type = { .owner = THIS_MODULE, .name = "jfs", .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = jfs_init_fs_context, .parameters = jfs_param_spec, }; MODULE_ALIAS_FS("jfs"); static void init_once(void *foo) { struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo; memset(jfs_ip, 0, sizeof(struct jfs_inode_info)); INIT_LIST_HEAD(&jfs_ip->anon_inode_list); init_rwsem(&jfs_ip->rdwrlock); mutex_init(&jfs_ip->commit_mutex); init_rwsem(&jfs_ip->xattr_sem); spin_lock_init(&jfs_ip->ag_lock); jfs_ip->active_ag = -1; inode_init_once(&jfs_ip->vfs_inode); } static int __init init_jfs_fs(void) { int i; int rc; jfs_inode_cachep = kmem_cache_create_usercopy("jfs_ip", sizeof(struct jfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, offsetof(struct jfs_inode_info, i_inline_all), sizeof_field(struct jfs_inode_info, i_inline_all), init_once); if (jfs_inode_cachep == NULL) return -ENOMEM; /* * Metapage initialization */ rc = metapage_init(); if (rc) { jfs_err("metapage_init failed w/rc = %d", rc); goto free_slab; } /* * Transaction Manager initialization */ rc = txInit(); if (rc) { jfs_err("txInit failed w/rc = %d", rc); goto free_metapage; } /* * I/O completion thread (endio) */ jfsIOthread = kthread_run(jfsIOWait, NULL, "jfsIO"); if (IS_ERR(jfsIOthread)) { rc = PTR_ERR(jfsIOthread); jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); goto end_txmngr; } if (commit_threads < 1) commit_threads = num_online_cpus(); if (commit_threads > MAX_COMMIT_THREADS) commit_threads = MAX_COMMIT_THREADS; for (i = 0; i < commit_threads; i++) { jfsCommitThread[i] = kthread_run(jfs_lazycommit, NULL, "jfsCommit"); if (IS_ERR(jfsCommitThread[i])) { rc = PTR_ERR(jfsCommitThread[i]); jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); commit_threads = i; goto kill_committask; } } jfsSyncThread = kthread_run(jfs_sync, NULL, "jfsSync"); if (IS_ERR(jfsSyncThread)) { rc = PTR_ERR(jfsSyncThread); jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); goto kill_committask; } #ifdef PROC_FS_JFS jfs_proc_init(); #endif rc = register_filesystem(&jfs_fs_type); if (!rc) return 0; #ifdef PROC_FS_JFS jfs_proc_clean(); #endif kthread_stop(jfsSyncThread); kill_committask: for (i = 0; i < commit_threads; i++) kthread_stop(jfsCommitThread[i]); kthread_stop(jfsIOthread); end_txmngr: txExit(); free_metapage: metapage_exit(); free_slab: kmem_cache_destroy(jfs_inode_cachep); return rc; } static void __exit exit_jfs_fs(void) { int i; jfs_info("exit_jfs_fs called"); txExit(); metapage_exit(); kthread_stop(jfsIOthread); for (i = 0; i < commit_threads; i++) kthread_stop(jfsCommitThread[i]); kthread_stop(jfsSyncThread); #ifdef PROC_FS_JFS jfs_proc_clean(); #endif unregister_filesystem(&jfs_fs_type); /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(jfs_inode_cachep); } module_init(init_jfs_fs) module_exit(exit_jfs_fs) |
6 6 6 9 8 9 450 302 68 1 29 432 421 9 9 3 9 483 515 497 41 6 491 9 496 501 483 512 511 515 518 516 514 1 516 515 520 2 485 46 2 2 32 48 235 317 1 536 537 1 50 514 523 351 258 440 163 541 543 34 1 50 27 541 539 81 192 476 507 538 539 57 2 57 57 45 48 2 47 23 29 8 2 1 25 6 11 3 37 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 | // SPDX-License-Identifier: GPL-2.0 #include <linux/pagewalk.h> #include <linux/highmem.h> #include <linux/sched.h> #include <linux/hugetlb.h> #include <linux/mmu_context.h> #include <linux/swap.h> #include <linux/swapops.h> #include <asm/tlbflush.h> #include "internal.h" /* * We want to know the real level where a entry is located ignoring any * folding of levels which may be happening. For example if p4d is folded then * a missing entry found at level 1 (p4d) is actually at level 0 (pgd). */ static int real_depth(int depth) { if (depth == 3 && PTRS_PER_PMD == 1) depth = 2; if (depth == 2 && PTRS_PER_PUD == 1) depth = 1; if (depth == 1 && PTRS_PER_P4D == 1) depth = 0; return depth; } static int walk_pte_range_inner(pte_t *pte, unsigned long addr, unsigned long end, struct mm_walk *walk) { const struct mm_walk_ops *ops = walk->ops; int err = 0; for (;;) { if (ops->install_pte && pte_none(ptep_get(pte))) { pte_t new_pte; err = ops->install_pte(addr, addr + PAGE_SIZE, &new_pte, walk); if (err) break; set_pte_at(walk->mm, addr, pte, new_pte); /* Non-present before, so for arches that need it. */ if (!WARN_ON_ONCE(walk->no_vma)) update_mmu_cache(walk->vma, addr, pte); } else { err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); if (err) break; } if (addr >= end - PAGE_SIZE) break; addr += PAGE_SIZE; pte++; } return err; } static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { pte_t *pte; int err = 0; spinlock_t *ptl; if (walk->no_vma) { /* * pte_offset_map() might apply user-specific validation. * Indeed, on x86_64 the pmd entries set up by init_espfix_ap() * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear), * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them. */ if (walk->mm == &init_mm || addr >= TASK_SIZE) pte = pte_offset_kernel(pmd, addr); else pte = pte_offset_map(pmd, addr); if (pte) { err = walk_pte_range_inner(pte, addr, end, walk); if (walk->mm != &init_mm && addr < TASK_SIZE) pte_unmap(pte); } } else { pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (pte) { err = walk_pte_range_inner(pte, addr, end, walk); pte_unmap_unlock(pte, ptl); } } if (!pte) walk->action = ACTION_AGAIN; return err; } static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, struct mm_walk *walk) { pmd_t *pmd; unsigned long next; const struct mm_walk_ops *ops = walk->ops; bool has_handler = ops->pte_entry; bool has_install = ops->install_pte; int err = 0; int depth = real_depth(3); pmd = pmd_offset(pud, addr); do { again: next = pmd_addr_end(addr, end); if (pmd_none(*pmd)) { if (has_install) err = __pte_alloc(walk->mm, pmd); else if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) break; if (!has_install) continue; } walk->action = ACTION_SUBTREE; /* * This implies that each ->pmd_entry() handler * needs to know about pmd_trans_huge() pmds */ if (ops->pmd_entry) err = ops->pmd_entry(pmd, addr, next, walk); if (err) break; if (walk->action == ACTION_AGAIN) goto again; if (walk->action == ACTION_CONTINUE) continue; if (!has_handler) { /* No handlers for lower page tables. */ if (!has_install) continue; /* Nothing to do. */ /* * We are ONLY installing, so avoid unnecessarily * splitting a present huge page. */ if (pmd_present(*pmd) && (pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) continue; } if (walk->vma) split_huge_pmd(walk->vma, pmd, addr); else if (pmd_leaf(*pmd) || !pmd_present(*pmd)) continue; /* Nothing to do. */ err = walk_pte_range(pmd, addr, next, walk); if (err) break; if (walk->action == ACTION_AGAIN) goto again; } while (pmd++, addr = next, addr != end); return err; } static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, struct mm_walk *walk) { pud_t *pud; unsigned long next; const struct mm_walk_ops *ops = walk->ops; bool has_handler = ops->pmd_entry || ops->pte_entry; bool has_install = ops->install_pte; int err = 0; int depth = real_depth(2); pud = pud_offset(p4d, addr); do { again: next = pud_addr_end(addr, end); if (pud_none(*pud)) { if (has_install) err = __pmd_alloc(walk->mm, pud, addr); else if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) break; if (!has_install) continue; } walk->action = ACTION_SUBTREE; if (ops->pud_entry) err = ops->pud_entry(pud, addr, next, walk); if (err) break; if (walk->action == ACTION_AGAIN) goto again; if (walk->action == ACTION_CONTINUE) continue; if (!has_handler) { /* No handlers for lower page tables. */ if (!has_install) continue; /* Nothing to do. */ /* * We are ONLY installing, so avoid unnecessarily * splitting a present huge page. */ if (pud_present(*pud) && (pud_trans_huge(*pud) || pud_devmap(*pud))) continue; } if (walk->vma) split_huge_pud(walk->vma, pud, addr); else if (pud_leaf(*pud) || !pud_present(*pud)) continue; /* Nothing to do. */ if (pud_none(*pud)) goto again; err = walk_pmd_range(pud, addr, next, walk); if (err) break; } while (pud++, addr = next, addr != end); return err; } static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, struct mm_walk *walk) { p4d_t *p4d; unsigned long next; const struct mm_walk_ops *ops = walk->ops; bool has_handler = ops->pud_entry || ops->pmd_entry || ops->pte_entry; bool has_install = ops->install_pte; int err = 0; int depth = real_depth(1); p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) { if (has_install) err = __pud_alloc(walk->mm, p4d, addr); else if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) break; if (!has_install) continue; } if (ops->p4d_entry) { err = ops->p4d_entry(p4d, addr, next, walk); if (err) break; } if (has_handler || has_install) err = walk_pud_range(p4d, addr, next, walk); if (err) break; } while (p4d++, addr = next, addr != end); return err; } static int walk_pgd_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { pgd_t *pgd; unsigned long next; const struct mm_walk_ops *ops = walk->ops; bool has_handler = ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry; bool has_install = ops->install_pte; int err = 0; if (walk->pgd) pgd = walk->pgd + pgd_index(addr); else pgd = pgd_offset(walk->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { if (has_install) err = __p4d_alloc(walk->mm, pgd, addr); else if (ops->pte_hole) err = ops->pte_hole(addr, next, 0, walk); if (err) break; if (!has_install) continue; } if (ops->pgd_entry) { err = ops->pgd_entry(pgd, addr, next, walk); if (err) break; } if (has_handler || has_install) err = walk_p4d_range(pgd, addr, next, walk); if (err) break; } while (pgd++, addr = next, addr != end); return err; } #ifdef CONFIG_HUGETLB_PAGE static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, unsigned long end) { unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h); return boundary < end ? boundary : end; } static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; struct hstate *h = hstate_vma(vma); unsigned long next; unsigned long hmask = huge_page_mask(h); unsigned long sz = huge_page_size(h); pte_t *pte; const struct mm_walk_ops *ops = walk->ops; int err = 0; hugetlb_vma_lock_read(vma); do { next = hugetlb_entry_end(h, addr, end); pte = hugetlb_walk(vma, addr & hmask, sz); if (pte) err = ops->hugetlb_entry(pte, hmask, addr, next, walk); else if (ops->pte_hole) err = ops->pte_hole(addr, next, -1, walk); if (err) break; } while (addr = next, addr != end); hugetlb_vma_unlock_read(vma); return err; } #else /* CONFIG_HUGETLB_PAGE */ static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { return 0; } #endif /* CONFIG_HUGETLB_PAGE */ /* * Decide whether we really walk over the current vma on [@start, @end) * or skip it via the returned value. Return 0 if we do walk over the * current vma, and return 1 if we skip the vma. Negative values means * error, where we abort the current walk. */ static int walk_page_test(unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; const struct mm_walk_ops *ops = walk->ops; if (ops->test_walk) return ops->test_walk(start, end, walk); /* * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP * range, so we don't walk over it as we do for normal vmas. However, * Some callers are interested in handling hole range and they don't * want to just ignore any single address range. Such users certainly * define their ->pte_hole() callbacks, so let's delegate them to handle * vma(VM_PFNMAP). */ if (vma->vm_flags & VM_PFNMAP) { int err = 1; if (ops->pte_hole) err = ops->pte_hole(start, end, -1, walk); return err ? err : 1; } return 0; } static int __walk_page_range(unsigned long start, unsigned long end, struct mm_walk *walk) { int err = 0; struct vm_area_struct *vma = walk->vma; const struct mm_walk_ops *ops = walk->ops; bool is_hugetlb = is_vm_hugetlb_page(vma); /* We do not support hugetlb PTE installation. */ if (ops->install_pte && is_hugetlb) return -EINVAL; if (ops->pre_vma) { err = ops->pre_vma(start, end, walk); if (err) return err; } if (is_hugetlb) { if (ops->hugetlb_entry) err = walk_hugetlb_range(start, end, walk); } else err = walk_pgd_range(start, end, walk); if (ops->post_vma) ops->post_vma(walk); return err; } static inline void process_mm_walk_lock(struct mm_struct *mm, enum page_walk_lock walk_lock) { if (walk_lock == PGWALK_RDLOCK) mmap_assert_locked(mm); else mmap_assert_write_locked(mm); } static inline void process_vma_walk_lock(struct vm_area_struct *vma, enum page_walk_lock walk_lock) { #ifdef CONFIG_PER_VMA_LOCK switch (walk_lock) { case PGWALK_WRLOCK: vma_start_write(vma); break; case PGWALK_WRLOCK_VERIFY: vma_assert_write_locked(vma); break; case PGWALK_RDLOCK: /* PGWALK_RDLOCK is handled by process_mm_walk_lock */ break; } #endif } /* * See the comment for walk_page_range(), this performs the heavy lifting of the * operation, only sets no restrictions on how the walk proceeds. * * We usually restrict the ability to install PTEs, but this functionality is * available to internal memory management code and provided in mm/internal.h. */ int walk_page_range_mm(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) { int err = 0; unsigned long next; struct vm_area_struct *vma; struct mm_walk walk = { .ops = ops, .mm = mm, .private = private, }; if (start >= end) return -EINVAL; if (!walk.mm) return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); vma = find_vma(walk.mm, start); do { if (!vma) { /* after the last vma */ walk.vma = NULL; next = end; if (ops->pte_hole) err = ops->pte_hole(start, next, -1, &walk); } else if (start < vma->vm_start) { /* outside vma */ walk.vma = NULL; next = min(end, vma->vm_start); if (ops->pte_hole) err = ops->pte_hole(start, next, -1, &walk); } else { /* inside vma */ process_vma_walk_lock(vma, ops->walk_lock); walk.vma = vma; next = min(end, vma->vm_end); vma = find_vma(mm, vma->vm_end); err = walk_page_test(start, next, &walk); if (err > 0) { /* * positive return values are purely for * controlling the pagewalk, so should never * be passed to the callers. */ err = 0; continue; } if (err < 0) break; err = __walk_page_range(start, next, &walk); } if (err) break; } while (start = next, start < end); return err; } /* * Determine if the walk operations specified are permitted to be used for a * page table walk. * * This check is performed on all functions which are parameterised by walk * operations and exposed in include/linux/pagewalk.h. * * Internal memory management code can use the walk_page_range_mm() function to * be able to use all page walking operations. */ static bool check_ops_valid(const struct mm_walk_ops *ops) { /* * The installation of PTEs is solely under the control of memory * management logic and subject to many subtle locking, security and * cache considerations so we cannot permit other users to do so, and * certainly not for exported symbols. */ if (ops->install_pte) return false; return true; } /** * walk_page_range - walk page table with caller specific callbacks * @mm: mm_struct representing the target process of page table walk * @start: start address of the virtual address range * @end: end address of the virtual address range * @ops: operation to call during the walk * @private: private data for callbacks' usage * * Recursively walk the page table tree of the process represented by @mm * within the virtual address range [@start, @end). During walking, we can do * some caller-specific works for each entry, by setting up pmd_entry(), * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these * callbacks, the associated entries/pages are just ignored. * The return values of these callbacks are commonly defined like below: * * - 0 : succeeded to handle the current entry, and if you don't reach the * end address yet, continue to walk. * - >0 : succeeded to handle the current entry, and return to the caller * with caller specific value. * - <0 : failed to handle the current entry, and return to the caller * with error code. * * Before starting to walk page table, some callers want to check whether * they really want to walk over the current vma, typically by checking * its vm_flags. walk_page_test() and @ops->test_walk() are used for this * purpose. * * If operations need to be staged before and committed after a vma is walked, * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(), * since it is intended to handle commit-type operations, can't return any * errors. * * struct mm_walk keeps current values of some common data like vma and pmd, * which are useful for the access from callbacks. If you want to pass some * caller-specific data to callbacks, @private should be helpful. * * Locking: * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock, * because these function traverse vma list and/or access to vma's data. */ int walk_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) { if (!check_ops_valid(ops)) return -EINVAL; return walk_page_range_mm(mm, start, end, ops, private); } /** * walk_page_range_novma - walk a range of pagetables not backed by a vma * @mm: mm_struct representing the target process of page table walk * @start: start address of the virtual address range * @end: end address of the virtual address range * @ops: operation to call during the walk * @pgd: pgd to walk if different from mm->pgd * @private: private data for callbacks' usage * * Similar to walk_page_range() but can walk any page tables even if they are * not backed by VMAs. Because 'unusual' entries may be walked this function * will also not lock the PTEs for the pte_entry() callback. This is useful for * walking the kernel pages tables or page tables for firmware. * * Note: Be careful to walk the kernel pages tables, the caller may be need to * take other effective approaches (mmap lock may be insufficient) to prevent * the intermediate kernel page tables belonging to the specified address range * from being freed (e.g. memory hot-remove). */ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private) { struct mm_walk walk = { .ops = ops, .mm = mm, .pgd = pgd, .private = private, .no_vma = true }; if (start >= end || !walk.mm) return -EINVAL; if (!check_ops_valid(ops)) return -EINVAL; /* * 1) For walking the user virtual address space: * * The mmap lock protects the page walker from changes to the page * tables during the walk. However a read lock is insufficient to * protect those areas which don't have a VMA as munmap() detaches * the VMAs before downgrading to a read lock and actually tearing * down PTEs/page tables. In which case, the mmap write lock should * be hold. * * 2) For walking the kernel virtual address space: * * The kernel intermediate page tables usually do not be freed, so * the mmap map read lock is sufficient. But there are some exceptions. * E.g. memory hot-remove. In which case, the mmap lock is insufficient * to prevent the intermediate kernel pages tables belonging to the * specified address range from being freed. The caller should take * other actions to prevent this race. */ if (mm == &init_mm) mmap_assert_locked(walk.mm); else mmap_assert_write_locked(walk.mm); return walk_pgd_range(start, end, &walk); } int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) { struct mm_walk walk = { .ops = ops, .mm = vma->vm_mm, .vma = vma, .private = private, }; if (start >= end || !walk.mm) return -EINVAL; if (start < vma->vm_start || end > vma->vm_end) return -EINVAL; if (!check_ops_valid(ops)) return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); process_vma_walk_lock(vma, ops->walk_lock); return __walk_page_range(start, end, &walk); } int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private) { struct mm_walk walk = { .ops = ops, .mm = vma->vm_mm, .vma = vma, .private = private, }; if (!walk.mm) return -EINVAL; if (!check_ops_valid(ops)) return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); process_vma_walk_lock(vma, ops->walk_lock); return __walk_page_range(vma->vm_start, vma->vm_end, &walk); } /** * walk_page_mapping - walk all memory areas mapped into a struct address_space. * @mapping: Pointer to the struct address_space * @first_index: First page offset in the address_space * @nr: Number of incremental page offsets to cover * @ops: operation to call during the walk * @private: private data for callbacks' usage * * This function walks all memory areas mapped into a struct address_space. * The walk is limited to only the given page-size index range, but if * the index boundaries cross a huge page-table entry, that entry will be * included. * * Also see walk_page_range() for additional information. * * Locking: * This function can't require that the struct mm_struct::mmap_lock is held, * since @mapping may be mapped by multiple processes. Instead * @mapping->i_mmap_rwsem must be held. This might have implications in the * callbacks, and it's up tho the caller to ensure that the * struct mm_struct::mmap_lock is not needed. * * Also this means that a caller can't rely on the struct * vm_area_struct::vm_flags to be constant across a call, * except for immutable flags. Callers requiring this shouldn't use * this function. * * Return: 0 on success, negative error code on failure, positive number on * caller defined premature termination. */ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, pgoff_t nr, const struct mm_walk_ops *ops, void *private) { struct mm_walk walk = { .ops = ops, .private = private, }; struct vm_area_struct *vma; pgoff_t vba, vea, cba, cea; unsigned long start_addr, end_addr; int err = 0; if (!check_ops_valid(ops)) return -EINVAL; lockdep_assert_held(&mapping->i_mmap_rwsem); vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index, first_index + nr - 1) { /* Clip to the vma */ vba = vma->vm_pgoff; vea = vba + vma_pages(vma); cba = first_index; cba = max(cba, vba); cea = first_index + nr; cea = min(cea, vea); start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start; end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start; if (start_addr >= end_addr) continue; walk.vma = vma; walk.mm = vma->vm_mm; err = walk_page_test(vma->vm_start, vma->vm_end, &walk); if (err > 0) { err = 0; break; } else if (err < 0) break; err = __walk_page_range(start_addr, end_addr, &walk); if (err) break; } return err; } /** * folio_walk_start - walk the page tables to a folio * @fw: filled with information on success. * @vma: the VMA. * @addr: the virtual address to use for the page table walk. * @flags: flags modifying which folios to walk to. * * Walk the page tables using @addr in a given @vma to a mapped folio and * return the folio, making sure that the page table entry referenced by * @addr cannot change until folio_walk_end() was called. * * As default, this function returns only folios that are not special (e.g., not * the zeropage) and never returns folios that are supposed to be ignored by the * VM as documented by vm_normal_page(). If requested, zeropages will be * returned as well. * * As default, this function only considers present page table entries. * If requested, it will also consider migration entries. * * If this function returns NULL it might either indicate "there is nothing" or * "there is nothing suitable". * * On success, @fw is filled and the function returns the folio while the PTL * is still held and folio_walk_end() must be called to clean up, * releasing any held locks. The returned folio must *not* be used after the * call to folio_walk_end(), unless a short-term folio reference is taken before * that call. * * @fw->page will correspond to the page that is effectively referenced by * @addr. However, for migration entries and shared zeropages @fw->page is * set to NULL. Note that large folios might be mapped by multiple page table * entries, and this function will always only lookup a single entry as * specified by @addr, which might or might not cover more than a single page of * the returned folio. * * This function must *not* be used as a naive replacement for * get_user_pages() / pin_user_pages(), especially not to perform DMA or * to carelessly modify page content. This function may *only* be used to grab * short-term folio references, never to grab long-term folio references. * * Using the page table entry pointers in @fw for reading or modifying the * entry should be avoided where possible: however, there might be valid * use cases. * * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care. * For example, PMD page table sharing might require prior unsharing. Also, * logical hugetlb entries might span multiple physical page table entries, * which *must* be modified in a single operation (set_huge_pte_at(), * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might * not correspond to the first physical entry of a logical hugetlb entry. * * The mmap lock must be held in read mode. * * Return: folio pointer on success, otherwise NULL. */ struct folio *folio_walk_start(struct folio_walk *fw, struct vm_area_struct *vma, unsigned long addr, folio_walk_flags_t flags) { unsigned long entry_size; bool expose_page = true; struct page *page; pud_t *pudp, pud; pmd_t *pmdp, pmd; pte_t *ptep, pte; spinlock_t *ptl; pgd_t *pgdp; p4d_t *p4dp; mmap_assert_locked(vma->vm_mm); vma_pgtable_walk_begin(vma); if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end)) goto not_found; pgdp = pgd_offset(vma->vm_mm, addr); if (pgd_none_or_clear_bad(pgdp)) goto not_found; p4dp = p4d_offset(pgdp, addr); if (p4d_none_or_clear_bad(p4dp)) goto not_found; pudp = pud_offset(p4dp, addr); pud = pudp_get(pudp); if (pud_none(pud)) goto not_found; if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && (!pud_present(pud) || pud_leaf(pud))) { ptl = pud_lock(vma->vm_mm, pudp); pud = pudp_get(pudp); entry_size = PUD_SIZE; fw->level = FW_LEVEL_PUD; fw->pudp = pudp; fw->pud = pud; /* * TODO: FW_MIGRATION support for PUD migration entries * once there are relevant users. */ if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) { spin_unlock(ptl); goto not_found; } else if (!pud_leaf(pud)) { spin_unlock(ptl); goto pmd_table; } /* * TODO: vm_normal_page_pud() will be handy once we want to * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs. */ page = pud_page(pud); goto found; } pmd_table: VM_WARN_ON_ONCE(!pud_present(pud) || pud_leaf(pud)); pmdp = pmd_offset(pudp, addr); pmd = pmdp_get_lockless(pmdp); if (pmd_none(pmd)) goto not_found; if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && (!pmd_present(pmd) || pmd_leaf(pmd))) { ptl = pmd_lock(vma->vm_mm, pmdp); pmd = pmdp_get(pmdp); entry_size = PMD_SIZE; fw->level = FW_LEVEL_PMD; fw->pmdp = pmdp; fw->pmd = pmd; if (pmd_none(pmd)) { spin_unlock(ptl); goto not_found; } else if (pmd_present(pmd) && !pmd_leaf(pmd)) { spin_unlock(ptl); goto pte_table; } else if (pmd_present(pmd)) { page = vm_normal_page_pmd(vma, addr, pmd); if (page) { goto found; } else if ((flags & FW_ZEROPAGE) && is_huge_zero_pmd(pmd)) { page = pfn_to_page(pmd_pfn(pmd)); expose_page = false; goto found; } } else if ((flags & FW_MIGRATION) && is_pmd_migration_entry(pmd)) { swp_entry_t entry = pmd_to_swp_entry(pmd); page = pfn_swap_entry_to_page(entry); expose_page = false; goto found; } spin_unlock(ptl); goto not_found; } pte_table: VM_WARN_ON_ONCE(!pmd_present(pmd) || pmd_leaf(pmd)); ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl); if (!ptep) goto not_found; pte = ptep_get(ptep); entry_size = PAGE_SIZE; fw->level = FW_LEVEL_PTE; fw->ptep = ptep; fw->pte = pte; if (pte_present(pte)) { page = vm_normal_page(vma, addr, pte); if (page) goto found; if ((flags & FW_ZEROPAGE) && is_zero_pfn(pte_pfn(pte))) { page = pfn_to_page(pte_pfn(pte)); expose_page = false; goto found; } } else if (!pte_none(pte)) { swp_entry_t entry = pte_to_swp_entry(pte); if ((flags & FW_MIGRATION) && is_migration_entry(entry)) { page = pfn_swap_entry_to_page(entry); expose_page = false; goto found; } } pte_unmap_unlock(ptep, ptl); not_found: vma_pgtable_walk_end(vma); return NULL; found: if (expose_page) /* Note: Offset from the mapped page, not the folio start. */ fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT); else fw->page = NULL; fw->ptl = ptl; return page_folio(page); } |
19 19 19 14 5 5 14 19 19 19 19 19 19 19 19 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2013 Jie Liu. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_trans_space.h" #include "xfs_da_btree.h" #include "xfs_bmap_btree.h" #include "xfs_trace.h" /* * Shortly after enabling the large extents count feature in 2023, longstanding * bugs were found in the code that computes the minimum log size. Luckily, * the bugs resulted in over-estimates of that size, so there's no impact to * existing users. However, we don't want to reduce the minimum log size * because that can create the situation where a newer mkfs writes a new * filesystem that an older kernel won't mount. * * Several years prior, we also discovered that the transaction reservations * for rmap and reflink operations were unnecessarily large. That was fixed, * but the minimum log size computation was left alone to avoid the * compatibility problems noted above. Fix that too. * * Therefore, we only may correct the computation starting with filesystem * features that didn't exist in 2023. In other words, only turn this on if * the filesystem has parent pointers. * * This function can be called before the XFS_HAS_* flags have been set up, * (e.g. mkfs) so we must check the ondisk superblock. */ static inline bool xfs_want_minlogsize_fixes( struct xfs_sb *sb) { return xfs_sb_is_v5(sb) && xfs_sb_has_incompat_feature(sb, XFS_SB_FEAT_INCOMPAT_PARENT); } /* * Calculate the maximum length in bytes that would be required for a local * attribute value as large attributes out of line are not logged. */ STATIC int xfs_log_calc_max_attrsetm_res( struct xfs_mount *mp) { int size; int nblks; size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) - MAXNAMELEN - 1; nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); nblks += XFS_B_TO_FSB(mp, size); /* * If the feature set is new enough, correct a unit conversion error in * the xattr transaction reservation code that resulted in oversized * minimum log size computations. */ if (xfs_want_minlogsize_fixes(&mp->m_sb)) size = XFS_B_TO_FSB(mp, size); nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK); return M_RES(mp)->tr_attrsetm.tr_logres + M_RES(mp)->tr_attrsetrt.tr_logres * nblks; } /* * Compute an alternate set of log reservation sizes for use exclusively with * minimum log size calculations. */ static void xfs_log_calc_trans_resv_for_minlogblocks( struct xfs_mount *mp, struct xfs_trans_resv *resv) { unsigned int rmap_maxlevels = mp->m_rmap_maxlevels; /* * If the feature set is new enough, drop the oversized minimum log * size computation introduced by the original reflink code. */ if (xfs_want_minlogsize_fixes(&mp->m_sb)) { xfs_trans_resv_calc(mp, resv); return; } /* * In the early days of rmap+reflink, we always set the rmap maxlevels * to 9 even if the AG was small enough that it would never grow to * that height. Transaction reservation sizes influence the minimum * log size calculation, which influences the size of the log that mkfs * creates. Use the old value here to ensure that newly formatted * small filesystems will mount on older kernels. */ if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp)) mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS; xfs_trans_resv_calc(mp, resv); if (xfs_has_reflink(mp)) { /* * In the early days of reflink, typical log operation counts * were greatly overestimated. */ resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; resv->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT_REFLINK; resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; } else if (xfs_has_rmapbt(mp)) { /* * In the early days of non-reflink rmap, the impact of rmapbt * updates on log counts were not taken into account at all. */ resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; resv->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; } /* * In the early days of reflink, we did not use deferred refcount * update log items, so log reservations must be recomputed using the * old calculations. */ resv->tr_write.tr_logres = xfs_calc_write_reservation_minlogsize(mp); resv->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation_minlogsize(mp); resv->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation_minlogsize(mp); /* Put everything back the way it was. This goes at the end. */ mp->m_rmap_maxlevels = rmap_maxlevels; } /* * Iterate over the log space reservation table to figure out and return * the maximum one in terms of the pre-calculated values which were done * at mount time. */ void xfs_log_get_max_trans_res( struct xfs_mount *mp, struct xfs_trans_res *max_resp) { struct xfs_trans_resv resv = {}; struct xfs_trans_res *resp; struct xfs_trans_res *end_resp; unsigned int i; int log_space = 0; int attr_space; attr_space = xfs_log_calc_max_attrsetm_res(mp); xfs_log_calc_trans_resv_for_minlogblocks(mp, &resv); resp = (struct xfs_trans_res *)&resv; end_resp = (struct xfs_trans_res *)(&resv + 1); for (i = 0; resp < end_resp; i++, resp++) { int tmp = resp->tr_logcount > 1 ? resp->tr_logres * resp->tr_logcount : resp->tr_logres; trace_xfs_trans_resv_calc_minlogsize(mp, i, resp); if (log_space < tmp) { log_space = tmp; *max_resp = *resp; /* struct copy */ } } if (attr_space > log_space) { *max_resp = resv.tr_attrsetm; /* struct copy */ max_resp->tr_logres = attr_space; } trace_xfs_log_get_max_trans_res(mp, max_resp); } /* * Calculate the minimum valid log size for the given superblock configuration. * Used to calculate the minimum log size at mkfs time, and to determine if * the log is large enough or not at mount time. Returns the minimum size in * filesystem block size units. */ int xfs_log_calc_minimum_size( struct xfs_mount *mp) { struct xfs_trans_res tres = {0}; int max_logres; int min_logblks = 0; int lsunit = 0; xfs_log_get_max_trans_res(mp, &tres); max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres); if (tres.tr_logcount > 1) max_logres *= tres.tr_logcount; if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1) lsunit = BTOBB(mp->m_sb.sb_logsunit); /* * Two factors should be taken into account for calculating the minimum * log space. * 1) The fundamental limitation is that no single transaction can be * larger than half size of the log. * * From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR * define, which is set to 3. That means we can definitely fit * maximally sized 2 transactions in the log. We'll use this same * value here. * * 2) If the lsunit option is specified, a transaction requires 2 LSU * for the reservation because there are two log writes that can * require padding - the transaction data and the commit record which * are written separately and both can require padding to the LSU. * Consider that we can have an active CIL reservation holding 2*LSU, * but the CIL is not over a push threshold, in this case, if we * don't have enough log space for at one new transaction, which * includes another 2*LSU in the reservation, we will run into dead * loop situation in log space grant procedure. i.e. * xlog_grant_head_wait(). * * Hence the log size needs to be able to contain two maximally sized * and padded transactions, which is (2 * (2 * LSU + maxlres)). * * Also, the log size should be a multiple of the log stripe unit, round * it up to lsunit boundary if lsunit is specified. */ if (lsunit) { min_logblks = roundup_64(BTOBB(max_logres), lsunit) + 2 * lsunit; } else min_logblks = BTOBB(max_logres) + 2 * BBSIZE; min_logblks *= XFS_MIN_LOG_FACTOR; return XFS_BB_TO_FSB(mp, min_logblks); } |
120 3 3 3 19 18 19 16 16 16 45 45 16 37 45 22 3 43 44 39 15 44 44 22 22 27 27 27 22 22 22 22 1 22 22 22 19 19 19 19 19 19 43 44 42 3 44 44 4 4 4 4 16 16 16 16 25 25 25 68 70 70 29 29 29 29 1 1 1 1 1 1 1 1 2 1 2 1 4 1 3 3 4 3 1 2 3 11 11 11 1 1 2 2 15 5 5 5 5 5 1 4 2 2 2 2 1 3 3 3 1 1 1 1 63 60 4 1 1 5 54 2 3 1 1 47 54 25 25 22 23 23 4 6 19 1 23 10 10 8 8 7 8 8 62 15 60 62 1 2 61 17 17 17 17 27 10 10 10 10 10 24 24 24 17 24 21 21 2 43 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 | // SPDX-License-Identifier: GPL-2.0+ /* * Driver core for serial ports * * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. * * Copyright 1999 ARM Limited * Copyright (C) 2000-2001 Deep Blue Solutions Ltd. */ #include <linux/module.h> #include <linux/tty.h> #include <linux/tty_flip.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/init.h> #include <linux/console.h> #include <linux/gpio/consumer.h> #include <linux/kernel.h> #include <linux/of.h> #include <linux/pm_runtime.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/device.h> #include <linux/serial.h> /* for serial_state and serial_icounter_struct */ #include <linux/serial_core.h> #include <linux/sysrq.h> #include <linux/delay.h> #include <linux/mutex.h> #include <linux/math64.h> #include <linux/security.h> #include <linux/irq.h> #include <linux/uaccess.h> #include "serial_base.h" /* * This is used to lock changes in serial line configuration. */ static DEFINE_MUTEX(port_mutex); /* * lockdep: port->lock is initialized in two places, but we * want only one lock-class: */ static struct lock_class_key port_lock_key; #define HIGH_BITS_OFFSET ((sizeof(long)-sizeof(int))*8) /* * Max time with active RTS before/after data is sent. */ #define RS485_MAX_RTS_DELAY 100 /* msecs */ static void uart_change_pm(struct uart_state *state, enum uart_pm_state pm_state); static void uart_port_shutdown(struct tty_port *port); static int uart_dcd_enabled(struct uart_port *uport) { return !!(uport->status & UPSTAT_DCD_ENABLE); } static inline struct uart_port *uart_port_ref(struct uart_state *state) { if (atomic_add_unless(&state->refcount, 1, 0)) return state->uart_port; return NULL; } static inline void uart_port_deref(struct uart_port *uport) { if (atomic_dec_and_test(&uport->state->refcount)) wake_up(&uport->state->remove_wait); } #define uart_port_lock(state, flags) \ ({ \ struct uart_port *__uport = uart_port_ref(state); \ if (__uport) \ uart_port_lock_irqsave(__uport, &flags); \ __uport; \ }) #define uart_port_unlock(uport, flags) \ ({ \ struct uart_port *__uport = uport; \ if (__uport) { \ uart_port_unlock_irqrestore(__uport, flags); \ uart_port_deref(__uport); \ } \ }) static inline struct uart_port *uart_port_check(struct uart_state *state) { lockdep_assert_held(&state->port.mutex); return state->uart_port; } /** * uart_write_wakeup - schedule write processing * @port: port to be processed * * This routine is used by the interrupt handler to schedule processing in the * software interrupt portion of the driver. A driver is expected to call this * function when the number of characters in the transmit buffer have dropped * below a threshold. * * Locking: @port->lock should be held */ void uart_write_wakeup(struct uart_port *port) { struct uart_state *state = port->state; /* * This means you called this function _after_ the port was * closed. No cookie for you. */ BUG_ON(!state); tty_port_tty_wakeup(&state->port); } EXPORT_SYMBOL(uart_write_wakeup); static void uart_stop(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; struct uart_port *port; unsigned long flags; port = uart_port_lock(state, flags); if (port) port->ops->stop_tx(port); uart_port_unlock(port, flags); } static void __uart_start(struct uart_state *state) { struct uart_port *port = state->uart_port; struct serial_port_device *port_dev; int err; if (!port || port->flags & UPF_DEAD || uart_tx_stopped(port)) return; port_dev = port->port_dev; /* Increment the runtime PM usage count for the active check below */ err = pm_runtime_get(&port_dev->dev); if (err < 0 && err != -EINPROGRESS) { pm_runtime_put_noidle(&port_dev->dev); return; } /* * Start TX if enabled, and kick runtime PM. If the device is not * enabled, serial_port_runtime_resume() calls start_tx() again * after enabling the device. */ if (!pm_runtime_enabled(port->dev) || pm_runtime_active(&port_dev->dev)) port->ops->start_tx(port); pm_runtime_mark_last_busy(&port_dev->dev); pm_runtime_put_autosuspend(&port_dev->dev); } static void uart_start(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; struct uart_port *port; unsigned long flags; port = uart_port_lock(state, flags); __uart_start(state); uart_port_unlock(port, flags); } static void uart_update_mctrl(struct uart_port *port, unsigned int set, unsigned int clear) { unsigned long flags; unsigned int old; uart_port_lock_irqsave(port, &flags); old = port->mctrl; port->mctrl = (old & ~clear) | set; if (old != port->mctrl && !(port->rs485.flags & SER_RS485_ENABLED)) port->ops->set_mctrl(port, port->mctrl); uart_port_unlock_irqrestore(port, flags); } #define uart_set_mctrl(port, set) uart_update_mctrl(port, set, 0) #define uart_clear_mctrl(port, clear) uart_update_mctrl(port, 0, clear) static void uart_port_dtr_rts(struct uart_port *uport, bool active) { if (active) uart_set_mctrl(uport, TIOCM_DTR | TIOCM_RTS); else uart_clear_mctrl(uport, TIOCM_DTR | TIOCM_RTS); } /* Caller holds port mutex */ static void uart_change_line_settings(struct tty_struct *tty, struct uart_state *state, const struct ktermios *old_termios) { struct uart_port *uport = uart_port_check(state); struct ktermios *termios; bool old_hw_stopped; /* * If we have no tty, termios, or the port does not exist, * then we can't set the parameters for this port. */ if (!tty || uport->type == PORT_UNKNOWN) return; termios = &tty->termios; uport->ops->set_termios(uport, termios, old_termios); /* * Set modem status enables based on termios cflag */ uart_port_lock_irq(uport); if (termios->c_cflag & CRTSCTS) uport->status |= UPSTAT_CTS_ENABLE; else uport->status &= ~UPSTAT_CTS_ENABLE; if (termios->c_cflag & CLOCAL) uport->status &= ~UPSTAT_DCD_ENABLE; else uport->status |= UPSTAT_DCD_ENABLE; /* reset sw-assisted CTS flow control based on (possibly) new mode */ old_hw_stopped = uport->hw_stopped; uport->hw_stopped = uart_softcts_mode(uport) && !(uport->ops->get_mctrl(uport) & TIOCM_CTS); if (uport->hw_stopped != old_hw_stopped) { if (!old_hw_stopped) uport->ops->stop_tx(uport); else __uart_start(state); } uart_port_unlock_irq(uport); } static int uart_alloc_xmit_buf(struct tty_port *port) { struct uart_state *state = container_of(port, struct uart_state, port); struct uart_port *uport; unsigned long flags; unsigned long page; /* * Initialise and allocate the transmit and temporary * buffer. */ page = get_zeroed_page(GFP_KERNEL); if (!page) return -ENOMEM; uport = uart_port_lock(state, flags); if (!state->port.xmit_buf) { state->port.xmit_buf = (unsigned char *)page; kfifo_init(&state->port.xmit_fifo, state->port.xmit_buf, PAGE_SIZE); uart_port_unlock(uport, flags); } else { uart_port_unlock(uport, flags); /* * Do not free() the page under the port lock, see * uart_free_xmit_buf(). */ free_page(page); } return 0; } static void uart_free_xmit_buf(struct tty_port *port) { struct uart_state *state = container_of(port, struct uart_state, port); struct uart_port *uport; unsigned long flags; char *xmit_buf; /* * Do not free() the transmit buffer page under the port lock since * this can create various circular locking scenarios. For instance, * console driver may need to allocate/free a debug object, which * can end up in printk() recursion. */ uport = uart_port_lock(state, flags); xmit_buf = port->xmit_buf; port->xmit_buf = NULL; INIT_KFIFO(port->xmit_fifo); uart_port_unlock(uport, flags); free_page((unsigned long)xmit_buf); } /* * Startup the port. This will be called once per open. All calls * will be serialised by the per-port mutex. */ static int uart_port_startup(struct tty_struct *tty, struct uart_state *state, bool init_hw) { struct uart_port *uport = uart_port_check(state); int retval; if (uport->type == PORT_UNKNOWN) return 1; /* * Make sure the device is in D0 state. */ uart_change_pm(state, UART_PM_STATE_ON); retval = uart_alloc_xmit_buf(&state->port); if (retval) return retval; retval = uport->ops->startup(uport); if (retval == 0) { if (uart_console(uport) && uport->cons->cflag) { tty->termios.c_cflag = uport->cons->cflag; tty->termios.c_ispeed = uport->cons->ispeed; tty->termios.c_ospeed = uport->cons->ospeed; uport->cons->cflag = 0; uport->cons->ispeed = 0; uport->cons->ospeed = 0; } /* * Initialise the hardware port settings. */ uart_change_line_settings(tty, state, NULL); /* * Setup the RTS and DTR signals once the * port is open and ready to respond. */ if (init_hw && C_BAUD(tty)) uart_port_dtr_rts(uport, true); } /* * This is to allow setserial on this port. People may want to set * port/irq/type and then reconfigure the port properly if it failed * now. */ if (retval && capable(CAP_SYS_ADMIN)) return 1; return retval; } static int uart_startup(struct tty_struct *tty, struct uart_state *state, bool init_hw) { struct tty_port *port = &state->port; struct uart_port *uport; int retval; if (tty_port_initialized(port)) goto out_base_port_startup; retval = uart_port_startup(tty, state, init_hw); if (retval) { set_bit(TTY_IO_ERROR, &tty->flags); return retval; } out_base_port_startup: uport = uart_port_check(state); if (!uport) return -EIO; serial_base_port_startup(uport); return 0; } /* * This routine will shutdown a serial port; interrupts are disabled, and * DTR is dropped if the hangup on close termio flag is on. Calls to * uart_shutdown are serialised by the per-port semaphore. * * uport == NULL if uart_port has already been removed */ static void uart_shutdown(struct tty_struct *tty, struct uart_state *state) { struct uart_port *uport = uart_port_check(state); struct tty_port *port = &state->port; /* * Set the TTY IO error marker */ if (tty) set_bit(TTY_IO_ERROR, &tty->flags); if (uport) serial_base_port_shutdown(uport); if (tty_port_initialized(port)) { tty_port_set_initialized(port, false); /* * Turn off DTR and RTS early. */ if (uport) { if (uart_console(uport) && tty) { uport->cons->cflag = tty->termios.c_cflag; uport->cons->ispeed = tty->termios.c_ispeed; uport->cons->ospeed = tty->termios.c_ospeed; } if (!tty || C_HUPCL(tty)) uart_port_dtr_rts(uport, false); } uart_port_shutdown(port); } /* * It's possible for shutdown to be called after suspend if we get * a DCD drop (hangup) at just the right time. Clear suspended bit so * we don't try to resume a port that has been shutdown. */ tty_port_set_suspended(port, false); uart_free_xmit_buf(port); } /** * uart_update_timeout - update per-port frame timing information * @port: uart_port structure describing the port * @cflag: termios cflag value * @baud: speed of the port * * Set the @port frame timing information from which the FIFO timeout value is * derived. The @cflag value should reflect the actual hardware settings as * number of bits, parity, stop bits and baud rate is taken into account here. * * Locking: caller is expected to take @port->lock */ void uart_update_timeout(struct uart_port *port, unsigned int cflag, unsigned int baud) { u64 temp = tty_get_frame_size(cflag); temp *= NSEC_PER_SEC; port->frame_time = (unsigned int)DIV64_U64_ROUND_UP(temp, baud); } EXPORT_SYMBOL(uart_update_timeout); /** * uart_get_baud_rate - return baud rate for a particular port * @port: uart_port structure describing the port in question. * @termios: desired termios settings * @old: old termios (or %NULL) * @min: minimum acceptable baud rate * @max: maximum acceptable baud rate * * Decode the termios structure into a numeric baud rate, taking account of the * magic 38400 baud rate (with spd_* flags), and mapping the %B0 rate to 9600 * baud. * * If the new baud rate is invalid, try the @old termios setting. If it's still * invalid, we try 9600 baud. If that is also invalid 0 is returned. * * The @termios structure is updated to reflect the baud rate we're actually * going to be using. Don't do this for the case where B0 is requested ("hang * up"). * * Locking: caller dependent */ unsigned int uart_get_baud_rate(struct uart_port *port, struct ktermios *termios, const struct ktermios *old, unsigned int min, unsigned int max) { unsigned int try; unsigned int baud; unsigned int altbaud; int hung_up = 0; upf_t flags = port->flags & UPF_SPD_MASK; switch (flags) { case UPF_SPD_HI: altbaud = 57600; break; case UPF_SPD_VHI: altbaud = 115200; break; case UPF_SPD_SHI: altbaud = 230400; break; case UPF_SPD_WARP: altbaud = 460800; break; default: altbaud = 38400; break; } for (try = 0; try < 2; try++) { baud = tty_termios_baud_rate(termios); /* * The spd_hi, spd_vhi, spd_shi, spd_warp kludge... * Die! Die! Die! */ if (try == 0 && baud == 38400) baud = altbaud; /* * Special case: B0 rate. */ if (baud == 0) { hung_up = 1; baud = 9600; } if (baud >= min && baud <= max) return baud; /* * Oops, the quotient was zero. Try again with * the old baud rate if possible. */ termios->c_cflag &= ~CBAUD; if (old) { baud = tty_termios_baud_rate(old); if (!hung_up) tty_termios_encode_baud_rate(termios, baud, baud); old = NULL; continue; } /* * As a last resort, if the range cannot be met then clip to * the nearest chip supported rate. */ if (!hung_up) { if (baud <= min) tty_termios_encode_baud_rate(termios, min + 1, min + 1); else tty_termios_encode_baud_rate(termios, max - 1, max - 1); } } return 0; } EXPORT_SYMBOL(uart_get_baud_rate); /** * uart_get_divisor - return uart clock divisor * @port: uart_port structure describing the port * @baud: desired baud rate * * Calculate the divisor (baud_base / baud) for the specified @baud, * appropriately rounded. * * If 38400 baud and custom divisor is selected, return the custom divisor * instead. * * Locking: caller dependent */ unsigned int uart_get_divisor(struct uart_port *port, unsigned int baud) { unsigned int quot; /* * Old custom speed handling. */ if (baud == 38400 && (port->flags & UPF_SPD_MASK) == UPF_SPD_CUST) quot = port->custom_divisor; else quot = DIV_ROUND_CLOSEST(port->uartclk, 16 * baud); return quot; } EXPORT_SYMBOL(uart_get_divisor); static int uart_put_char(struct tty_struct *tty, u8 c) { struct uart_state *state = tty->driver_data; struct uart_port *port; unsigned long flags; int ret = 0; port = uart_port_lock(state, flags); if (!state->port.xmit_buf) { uart_port_unlock(port, flags); return 0; } if (port) ret = kfifo_put(&state->port.xmit_fifo, c); uart_port_unlock(port, flags); return ret; } static void uart_flush_chars(struct tty_struct *tty) { uart_start(tty); } static ssize_t uart_write(struct tty_struct *tty, const u8 *buf, size_t count) { struct uart_state *state = tty->driver_data; struct uart_port *port; unsigned long flags; int ret = 0; /* * This means you called this function _after_ the port was * closed. No cookie for you. */ if (WARN_ON(!state)) return -EL3HLT; port = uart_port_lock(state, flags); if (!state->port.xmit_buf) { uart_port_unlock(port, flags); return 0; } if (port) ret = kfifo_in(&state->port.xmit_fifo, buf, count); __uart_start(state); uart_port_unlock(port, flags); return ret; } static unsigned int uart_write_room(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; struct uart_port *port; unsigned long flags; unsigned int ret; port = uart_port_lock(state, flags); ret = kfifo_avail(&state->port.xmit_fifo); uart_port_unlock(port, flags); return ret; } static unsigned int uart_chars_in_buffer(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; struct uart_port *port; unsigned long flags; unsigned int ret; port = uart_port_lock(state, flags); ret = kfifo_len(&state->port.xmit_fifo); uart_port_unlock(port, flags); return ret; } static void uart_flush_buffer(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; struct uart_port *port; unsigned long flags; /* * This means you called this function _after_ the port was * closed. No cookie for you. */ if (WARN_ON(!state)) return; pr_debug("uart_flush_buffer(%d) called\n", tty->index); port = uart_port_lock(state, flags); if (!port) return; kfifo_reset(&state->port.xmit_fifo); if (port->ops->flush_buffer) port->ops->flush_buffer(port); uart_port_unlock(port, flags); tty_port_tty_wakeup(&state->port); } /* * This function performs low-level write of high-priority XON/XOFF * character and accounting for it. * * Requires uart_port to implement .serial_out(). */ void uart_xchar_out(struct uart_port *uport, int offset) { serial_port_out(uport, offset, uport->x_char); uport->icount.tx++; uport->x_char = 0; } EXPORT_SYMBOL_GPL(uart_xchar_out); /* * This function is used to send a high-priority XON/XOFF character to * the device */ static void uart_send_xchar(struct tty_struct *tty, u8 ch) { struct uart_state *state = tty->driver_data; struct uart_port *port; unsigned long flags; port = uart_port_ref(state); if (!port) return; if (port->ops->send_xchar) port->ops->send_xchar(port, ch); else { uart_port_lock_irqsave(port, &flags); port->x_char = ch; if (ch) port->ops->start_tx(port); uart_port_unlock_irqrestore(port, flags); } uart_port_deref(port); } static void uart_throttle(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; upstat_t mask = UPSTAT_SYNC_FIFO; struct uart_port *port; port = uart_port_ref(state); if (!port) return; if (I_IXOFF(tty)) mask |= UPSTAT_AUTOXOFF; if (C_CRTSCTS(tty)) mask |= UPSTAT_AUTORTS; if (port->status & mask) { port->ops->throttle(port); mask &= ~port->status; } if (mask & UPSTAT_AUTORTS) uart_clear_mctrl(port, TIOCM_RTS); if (mask & UPSTAT_AUTOXOFF) uart_send_xchar(tty, STOP_CHAR(tty)); uart_port_deref(port); } static void uart_unthrottle(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; upstat_t mask = UPSTAT_SYNC_FIFO; struct uart_port *port; port = uart_port_ref(state); if (!port) return; if (I_IXOFF(tty)) mask |= UPSTAT_AUTOXOFF; if (C_CRTSCTS(tty)) mask |= UPSTAT_AUTORTS; if (port->status & mask) { port->ops->unthrottle(port); mask &= ~port->status; } if (mask & UPSTAT_AUTORTS) uart_set_mctrl(port, TIOCM_RTS); if (mask & UPSTAT_AUTOXOFF) uart_send_xchar(tty, START_CHAR(tty)); uart_port_deref(port); } static int uart_get_info(struct tty_port *port, struct serial_struct *retinfo) { struct uart_state *state = container_of(port, struct uart_state, port); struct uart_port *uport; int ret = -ENODEV; /* Initialize structure in case we error out later to prevent any stack info leakage. */ *retinfo = (struct serial_struct){}; /* * Ensure the state we copy is consistent and no hardware changes * occur as we go */ mutex_lock(&port->mutex); uport = uart_port_check(state); if (!uport) goto out; retinfo->type = uport->type; retinfo->line = uport->line; retinfo->port = uport->iobase; if (HIGH_BITS_OFFSET) retinfo->port_high = (long) uport->iobase >> HIGH_BITS_OFFSET; retinfo->irq = uport->irq; retinfo->flags = (__force int)uport->flags; retinfo->xmit_fifo_size = uport->fifosize; retinfo->baud_base = uport->uartclk / 16; retinfo->close_delay = jiffies_to_msecs(port->close_delay) / 10; retinfo->closing_wait = port->closing_wait == ASYNC_CLOSING_WAIT_NONE ? ASYNC_CLOSING_WAIT_NONE : jiffies_to_msecs(port->closing_wait) / 10; retinfo->custom_divisor = uport->custom_divisor; retinfo->hub6 = uport->hub6; retinfo->io_type = uport->iotype; retinfo->iomem_reg_shift = uport->regshift; retinfo->iomem_base = (void *)(unsigned long)uport->mapbase; ret = 0; out: mutex_unlock(&port->mutex); return ret; } static int uart_get_info_user(struct tty_struct *tty, struct serial_struct *ss) { struct uart_state *state = tty->driver_data; struct tty_port *port = &state->port; return uart_get_info(port, ss) < 0 ? -EIO : 0; } static int uart_set_info(struct tty_struct *tty, struct tty_port *port, struct uart_state *state, struct serial_struct *new_info) { struct uart_port *uport = uart_port_check(state); unsigned long new_port; unsigned int change_irq, change_port, closing_wait; unsigned int old_custom_divisor, close_delay; upf_t old_flags, new_flags; int retval = 0; if (!uport) return -EIO; new_port = new_info->port; if (HIGH_BITS_OFFSET) new_port += (unsigned long) new_info->port_high << HIGH_BITS_OFFSET; new_info->irq = irq_canonicalize(new_info->irq); close_delay = msecs_to_jiffies(new_info->close_delay * 10); closing_wait = new_info->closing_wait == ASYNC_CLOSING_WAIT_NONE ? ASYNC_CLOSING_WAIT_NONE : msecs_to_jiffies(new_info->closing_wait * 10); change_irq = !(uport->flags & UPF_FIXED_PORT) && new_info->irq != uport->irq; /* * Since changing the 'type' of the port changes its resource * allocations, we should treat type changes the same as * IO port changes. */ change_port = !(uport->flags & UPF_FIXED_PORT) && (new_port != uport->iobase || (unsigned long)new_info->iomem_base != uport->mapbase || new_info->hub6 != uport->hub6 || new_info->io_type != uport->iotype || new_info->iomem_reg_shift != uport->regshift || new_info->type != uport->type); old_flags = uport->flags; new_flags = (__force upf_t)new_info->flags; old_custom_divisor = uport->custom_divisor; if (!(uport->flags & UPF_FIXED_PORT)) { unsigned int uartclk = new_info->baud_base * 16; /* check needs to be done here before other settings made */ if (uartclk == 0) { retval = -EINVAL; goto exit; } } if (!capable(CAP_SYS_ADMIN)) { retval = -EPERM; if (change_irq || change_port || (new_info->baud_base != uport->uartclk / 16) || (close_delay != port->close_delay) || (closing_wait != port->closing_wait) || (new_info->xmit_fifo_size && new_info->xmit_fifo_size != uport->fifosize) || (((new_flags ^ old_flags) & ~UPF_USR_MASK) != 0)) goto exit; uport->flags = ((uport->flags & ~UPF_USR_MASK) | (new_flags & UPF_USR_MASK)); uport->custom_divisor = new_info->custom_divisor; goto check_and_exit; } if (change_irq || change_port) { retval = security_locked_down(LOCKDOWN_TIOCSSERIAL); if (retval) goto exit; } /* * Ask the low level driver to verify the settings. */ if (uport->ops->verify_port) retval = uport->ops->verify_port(uport, new_info); if ((new_info->irq >= irq_get_nr_irqs()) || (new_info->irq < 0) || (new_info->baud_base < 9600)) retval = -EINVAL; if (retval) goto exit; if (change_port || change_irq) { retval = -EBUSY; /* * Make sure that we are the sole user of this port. */ if (tty_port_users(port) > 1) goto exit; /* * We need to shutdown the serial port at the old * port/type/irq combination. */ uart_shutdown(tty, state); } if (change_port) { unsigned long old_iobase, old_mapbase; unsigned int old_type, old_iotype, old_hub6, old_shift; old_iobase = uport->iobase; old_mapbase = uport->mapbase; old_type = uport->type; old_hub6 = uport->hub6; old_iotype = uport->iotype; old_shift = uport->regshift; /* * Free and release old regions */ if (old_type != PORT_UNKNOWN && uport->ops->release_port) uport->ops->release_port(uport); uport->iobase = new_port; uport->type = new_info->type; uport->hub6 = new_info->hub6; uport->iotype = new_info->io_type; uport->regshift = new_info->iomem_reg_shift; uport->mapbase = (unsigned long)new_info->iomem_base; /* * Claim and map the new regions */ if (uport->type != PORT_UNKNOWN && uport->ops->request_port) { retval = uport->ops->request_port(uport); } else { /* Always success - Jean II */ retval = 0; } /* * If we fail to request resources for the * new port, try to restore the old settings. */ if (retval) { uport->iobase = old_iobase; uport->type = old_type; uport->hub6 = old_hub6; uport->iotype = old_iotype; uport->regshift = old_shift; uport->mapbase = old_mapbase; if (old_type != PORT_UNKNOWN) { retval = uport->ops->request_port(uport); /* * If we failed to restore the old settings, * we fail like this. */ if (retval) uport->type = PORT_UNKNOWN; /* * We failed anyway. */ retval = -EBUSY; } /* Added to return the correct error -Ram Gupta */ goto exit; } } if (change_irq) uport->irq = new_info->irq; if (!(uport->flags & UPF_FIXED_PORT)) uport->uartclk = new_info->baud_base * 16; uport->flags = (uport->flags & ~UPF_CHANGE_MASK) | (new_flags & UPF_CHANGE_MASK); uport->custom_divisor = new_info->custom_divisor; port->close_delay = close_delay; port->closing_wait = closing_wait; if (new_info->xmit_fifo_size) uport->fifosize = new_info->xmit_fifo_size; check_and_exit: retval = 0; if (uport->type == PORT_UNKNOWN) goto exit; if (tty_port_initialized(port)) { if (((old_flags ^ uport->flags) & UPF_SPD_MASK) || old_custom_divisor != uport->custom_divisor) { /* * If they're setting up a custom divisor or speed, * instead of clearing it, then bitch about it. */ if (uport->flags & UPF_SPD_MASK) { dev_notice_ratelimited(uport->dev, "%s sets custom speed on %s. This is deprecated.\n", current->comm, tty_name(port->tty)); } uart_change_line_settings(tty, state, NULL); } } else { retval = uart_startup(tty, state, true); if (retval == 0) tty_port_set_initialized(port, true); if (retval > 0) retval = 0; } exit: return retval; } static int uart_set_info_user(struct tty_struct *tty, struct serial_struct *ss) { struct uart_state *state = tty->driver_data; struct tty_port *port = &state->port; int retval; down_write(&tty->termios_rwsem); /* * This semaphore protects port->count. It is also * very useful to prevent opens. Also, take the * port configuration semaphore to make sure that a * module insertion/removal doesn't change anything * under us. */ mutex_lock(&port->mutex); retval = uart_set_info(tty, port, state, ss); mutex_unlock(&port->mutex); up_write(&tty->termios_rwsem); return retval; } /** * uart_get_lsr_info - get line status register info * @tty: tty associated with the UART * @state: UART being queried * @value: returned modem value */ static int uart_get_lsr_info(struct tty_struct *tty, struct uart_state *state, unsigned int __user *value) { struct uart_port *uport = uart_port_check(state); unsigned int result; result = uport->ops->tx_empty(uport); /* * If we're about to load something into the transmit * register, we'll pretend the transmitter isn't empty to * avoid a race condition (depending on when the transmit * interrupt happens). */ if (uport->x_char || (!kfifo_is_empty(&state->port.xmit_fifo) && !uart_tx_stopped(uport))) result &= ~TIOCSER_TEMT; return put_user(result, value); } static int uart_tiocmget(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; struct tty_port *port = &state->port; struct uart_port *uport; int result; guard(mutex)(&port->mutex); uport = uart_port_check(state); if (!uport || tty_io_error(tty)) return -EIO; uart_port_lock_irq(uport); result = uport->mctrl; result |= uport->ops->get_mctrl(uport); uart_port_unlock_irq(uport); return result; } static int uart_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct uart_state *state = tty->driver_data; struct tty_port *port = &state->port; struct uart_port *uport; guard(mutex)(&port->mutex); uport = uart_port_check(state); if (!uport || tty_io_error(tty)) return -EIO; uart_update_mctrl(uport, set, clear); return 0; } static int uart_break_ctl(struct tty_struct *tty, int break_state) { struct uart_state *state = tty->driver_data; struct tty_port *port = &state->port; struct uart_port *uport; guard(mutex)(&port->mutex); uport = uart_port_check(state); if (!uport) return -EIO; if (uport->type != PORT_UNKNOWN && uport->ops->break_ctl) uport->ops->break_ctl(uport, break_state); return 0; } static int uart_do_autoconfig(struct tty_struct *tty, struct uart_state *state) { struct tty_port *port = &state->port; struct uart_port *uport; int flags, ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* * Take the per-port semaphore. This prevents count from * changing, and hence any extra opens of the port while * we're auto-configuring. */ scoped_cond_guard(mutex_intr, return -ERESTARTSYS, &port->mutex) { uport = uart_port_check(state); if (!uport) return -EIO; if (tty_port_users(port) != 1) return -EBUSY; uart_shutdown(tty, state); /* * If we already have a port type configured, * we must release its resources. */ if (uport->type != PORT_UNKNOWN && uport->ops->release_port) uport->ops->release_port(uport); flags = UART_CONFIG_TYPE; if (uport->flags & UPF_AUTO_IRQ) flags |= UART_CONFIG_IRQ; /* * This will claim the ports resources if * a port is found. */ uport->ops->config_port(uport, flags); ret = uart_startup(tty, state, true); if (ret < 0) return ret; if (ret > 0) return 0; tty_port_set_initialized(port, true); } return 0; } static void uart_enable_ms(struct uart_port *uport) { /* * Force modem status interrupts on */ if (uport->ops->enable_ms) uport->ops->enable_ms(uport); } /* * Wait for any of the 4 modem inputs (DCD,RI,DSR,CTS) to change * - mask passed in arg for lines of interest * (use |'ed TIOCM_RNG/DSR/CD/CTS for masking) * Caller should use TIOCGICOUNT to see which one it was * * FIXME: This wants extracting into a common all driver implementation * of TIOCMWAIT using tty_port. */ static int uart_wait_modem_status(struct uart_state *state, unsigned long arg) { struct uart_port *uport; struct tty_port *port = &state->port; DECLARE_WAITQUEUE(wait, current); struct uart_icount cprev, cnow; int ret; /* * note the counters on entry */ uport = uart_port_ref(state); if (!uport) return -EIO; uart_port_lock_irq(uport); memcpy(&cprev, &uport->icount, sizeof(struct uart_icount)); uart_enable_ms(uport); uart_port_unlock_irq(uport); add_wait_queue(&port->delta_msr_wait, &wait); for (;;) { uart_port_lock_irq(uport); memcpy(&cnow, &uport->icount, sizeof(struct uart_icount)); uart_port_unlock_irq(uport); set_current_state(TASK_INTERRUPTIBLE); if (((arg & TIOCM_RNG) && (cnow.rng != cprev.rng)) || ((arg & TIOCM_DSR) && (cnow.dsr != cprev.dsr)) || ((arg & TIOCM_CD) && (cnow.dcd != cprev.dcd)) || ((arg & TIOCM_CTS) && (cnow.cts != cprev.cts))) { ret = 0; break; } schedule(); /* see if a signal did it */ if (signal_pending(current)) { ret = -ERESTARTSYS; break; } cprev = cnow; } __set_current_state(TASK_RUNNING); remove_wait_queue(&port->delta_msr_wait, &wait); uart_port_deref(uport); return ret; } /* * Get counter of input serial line interrupts (DCD,RI,DSR,CTS) * Return: write counters to the user passed counter struct * NB: both 1->0 and 0->1 transitions are counted except for * RI where only 0->1 is counted. */ static int uart_get_icount(struct tty_struct *tty, struct serial_icounter_struct *icount) { struct uart_state *state = tty->driver_data; struct uart_icount cnow; struct uart_port *uport; uport = uart_port_ref(state); if (!uport) return -EIO; uart_port_lock_irq(uport); memcpy(&cnow, &uport->icount, sizeof(struct uart_icount)); uart_port_unlock_irq(uport); uart_port_deref(uport); icount->cts = cnow.cts; icount->dsr = cnow.dsr; icount->rng = cnow.rng; icount->dcd = cnow.dcd; icount->rx = cnow.rx; icount->tx = cnow.tx; icount->frame = cnow.frame; icount->overrun = cnow.overrun; icount->parity = cnow.parity; icount->brk = cnow.brk; icount->buf_overrun = cnow.buf_overrun; return 0; } #define SER_RS485_LEGACY_FLAGS (SER_RS485_ENABLED | SER_RS485_RTS_ON_SEND | \ SER_RS485_RTS_AFTER_SEND | SER_RS485_RX_DURING_TX | \ SER_RS485_TERMINATE_BUS) static int uart_check_rs485_flags(struct uart_port *port, struct serial_rs485 *rs485) { u32 flags = rs485->flags; /* Don't return -EINVAL for unsupported legacy flags */ flags &= ~SER_RS485_LEGACY_FLAGS; /* * For any bit outside of the legacy ones that is not supported by * the driver, return -EINVAL. */ if (flags & ~port->rs485_supported.flags) return -EINVAL; /* Asking for address w/o addressing mode? */ if (!(rs485->flags & SER_RS485_ADDRB) && (rs485->flags & (SER_RS485_ADDR_RECV|SER_RS485_ADDR_DEST))) return -EINVAL; /* Address given but not enabled? */ if (!(rs485->flags & SER_RS485_ADDR_RECV) && rs485->addr_recv) return -EINVAL; if (!(rs485->flags & SER_RS485_ADDR_DEST) && rs485->addr_dest) return -EINVAL; return 0; } static void uart_sanitize_serial_rs485_delays(struct uart_port *port, struct serial_rs485 *rs485) { if (!port->rs485_supported.delay_rts_before_send) { if (rs485->delay_rts_before_send) { dev_warn_ratelimited(port->dev, "%s (%d): RTS delay before sending not supported\n", port->name, port->line); } rs485->delay_rts_before_send = 0; } else if (rs485->delay_rts_before_send > RS485_MAX_RTS_DELAY) { rs485->delay_rts_before_send = RS485_MAX_RTS_DELAY; dev_warn_ratelimited(port->dev, "%s (%d): RTS delay before sending clamped to %u ms\n", port->name, port->line, rs485->delay_rts_before_send); } if (!port->rs485_supported.delay_rts_after_send) { if (rs485->delay_rts_after_send) { dev_warn_ratelimited(port->dev, "%s (%d): RTS delay after sending not supported\n", port->name, port->line); } rs485->delay_rts_after_send = 0; } else if (rs485->delay_rts_after_send > RS485_MAX_RTS_DELAY) { rs485->delay_rts_after_send = RS485_MAX_RTS_DELAY; dev_warn_ratelimited(port->dev, "%s (%d): RTS delay after sending clamped to %u ms\n", port->name, port->line, rs485->delay_rts_after_send); } } static void uart_sanitize_serial_rs485(struct uart_port *port, struct serial_rs485 *rs485) { u32 supported_flags = port->rs485_supported.flags; if (!(rs485->flags & SER_RS485_ENABLED)) { memset(rs485, 0, sizeof(*rs485)); return; } /* Clear other RS485 flags but SER_RS485_TERMINATE_BUS and return if enabling RS422 */ if (rs485->flags & SER_RS485_MODE_RS422) { rs485->flags &= (SER_RS485_ENABLED | SER_RS485_MODE_RS422 | SER_RS485_TERMINATE_BUS); return; } rs485->flags &= supported_flags; /* Pick sane settings if the user hasn't */ if (!(rs485->flags & SER_RS485_RTS_ON_SEND) == !(rs485->flags & SER_RS485_RTS_AFTER_SEND)) { if (supported_flags & SER_RS485_RTS_ON_SEND) { rs485->flags |= SER_RS485_RTS_ON_SEND; rs485->flags &= ~SER_RS485_RTS_AFTER_SEND; dev_warn_ratelimited(port->dev, "%s (%d): invalid RTS setting, using RTS_ON_SEND instead\n", port->name, port->line); } else { rs485->flags |= SER_RS485_RTS_AFTER_SEND; rs485->flags &= ~SER_RS485_RTS_ON_SEND; dev_warn_ratelimited(port->dev, "%s (%d): invalid RTS setting, using RTS_AFTER_SEND instead\n", port->name, port->line); } } uart_sanitize_serial_rs485_delays(port, rs485); /* Return clean padding area to userspace */ memset(rs485->padding0, 0, sizeof(rs485->padding0)); memset(rs485->padding1, 0, sizeof(rs485->padding1)); } static void uart_set_rs485_termination(struct uart_port *port, const struct serial_rs485 *rs485) { if (!(rs485->flags & SER_RS485_ENABLED)) return; gpiod_set_value_cansleep(port->rs485_term_gpio, !!(rs485->flags & SER_RS485_TERMINATE_BUS)); } static void uart_set_rs485_rx_during_tx(struct uart_port *port, const struct serial_rs485 *rs485) { if (!(rs485->flags & SER_RS485_ENABLED)) return; gpiod_set_value_cansleep(port->rs485_rx_during_tx_gpio, !!(rs485->flags & SER_RS485_RX_DURING_TX)); } static int uart_rs485_config(struct uart_port *port) { struct serial_rs485 *rs485 = &port->rs485; unsigned long flags; int ret; if (!(rs485->flags & SER_RS485_ENABLED)) return 0; uart_sanitize_serial_rs485(port, rs485); uart_set_rs485_termination(port, rs485); uart_set_rs485_rx_during_tx(port, rs485); uart_port_lock_irqsave(port, &flags); ret = port->rs485_config(port, NULL, rs485); uart_port_unlock_irqrestore(port, flags); if (ret) { memset(rs485, 0, sizeof(*rs485)); /* unset GPIOs */ gpiod_set_value_cansleep(port->rs485_term_gpio, 0); gpiod_set_value_cansleep(port->rs485_rx_during_tx_gpio, 0); } return ret; } static int uart_get_rs485_config(struct uart_port *port, struct serial_rs485 __user *rs485) { unsigned long flags; struct serial_rs485 aux; uart_port_lock_irqsave(port, &flags); aux = port->rs485; uart_port_unlock_irqrestore(port, flags); if (copy_to_user(rs485, &aux, sizeof(aux))) return -EFAULT; return 0; } static int uart_set_rs485_config(struct tty_struct *tty, struct uart_port *port, struct serial_rs485 __user *rs485_user) { struct serial_rs485 rs485; int ret; unsigned long flags; if (!(port->rs485_supported.flags & SER_RS485_ENABLED)) return -ENOTTY; if (copy_from_user(&rs485, rs485_user, sizeof(*rs485_user))) return -EFAULT; ret = uart_check_rs485_flags(port, &rs485); if (ret) return ret; uart_sanitize_serial_rs485(port, &rs485); uart_set_rs485_termination(port, &rs485); uart_set_rs485_rx_during_tx(port, &rs485); uart_port_lock_irqsave(port, &flags); ret = port->rs485_config(port, &tty->termios, &rs485); if (!ret) { port->rs485 = rs485; /* Reset RTS and other mctrl lines when disabling RS485 */ if (!(rs485.flags & SER_RS485_ENABLED)) port->ops->set_mctrl(port, port->mctrl); } uart_port_unlock_irqrestore(port, flags); if (ret) { /* restore old GPIO settings */ gpiod_set_value_cansleep(port->rs485_term_gpio, !!(port->rs485.flags & SER_RS485_TERMINATE_BUS)); gpiod_set_value_cansleep(port->rs485_rx_during_tx_gpio, !!(port->rs485.flags & SER_RS485_RX_DURING_TX)); return ret; } if (copy_to_user(rs485_user, &port->rs485, sizeof(port->rs485))) return -EFAULT; return 0; } static int uart_get_iso7816_config(struct uart_port *port, struct serial_iso7816 __user *iso7816) { unsigned long flags; struct serial_iso7816 aux; if (!port->iso7816_config) return -ENOTTY; uart_port_lock_irqsave(port, &flags); aux = port->iso7816; uart_port_unlock_irqrestore(port, flags); if (copy_to_user(iso7816, &aux, sizeof(aux))) return -EFAULT; return 0; } static int uart_set_iso7816_config(struct uart_port *port, struct serial_iso7816 __user *iso7816_user) { struct serial_iso7816 iso7816; int i, ret; unsigned long flags; if (!port->iso7816_config) return -ENOTTY; if (copy_from_user(&iso7816, iso7816_user, sizeof(*iso7816_user))) return -EFAULT; /* * There are 5 words reserved for future use. Check that userspace * doesn't put stuff in there to prevent breakages in the future. */ for (i = 0; i < ARRAY_SIZE(iso7816.reserved); i++) if (iso7816.reserved[i]) return -EINVAL; uart_port_lock_irqsave(port, &flags); ret = port->iso7816_config(port, &iso7816); uart_port_unlock_irqrestore(port, flags); if (ret) return ret; if (copy_to_user(iso7816_user, &port->iso7816, sizeof(port->iso7816))) return -EFAULT; return 0; } /* * Called via sys_ioctl. We can use spin_lock_irq() here. */ static int uart_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct uart_state *state = tty->driver_data; struct tty_port *port = &state->port; struct uart_port *uport; void __user *uarg = (void __user *)arg; int ret = -ENOIOCTLCMD; /* * These ioctls don't rely on the hardware to be present. */ switch (cmd) { case TIOCSERCONFIG: down_write(&tty->termios_rwsem); ret = uart_do_autoconfig(tty, state); up_write(&tty->termios_rwsem); break; } if (ret != -ENOIOCTLCMD) goto out; if (tty_io_error(tty)) { ret = -EIO; goto out; } /* * The following should only be used when hardware is present. */ switch (cmd) { case TIOCMIWAIT: ret = uart_wait_modem_status(state, arg); break; } if (ret != -ENOIOCTLCMD) goto out; /* rs485_config requires more locking than others */ if (cmd == TIOCSRS485) down_write(&tty->termios_rwsem); mutex_lock(&port->mutex); uport = uart_port_check(state); if (!uport || tty_io_error(tty)) { ret = -EIO; goto out_up; } /* * All these rely on hardware being present and need to be * protected against the tty being hung up. */ switch (cmd) { case TIOCSERGETLSR: /* Get line status register */ ret = uart_get_lsr_info(tty, state, uarg); break; case TIOCGRS485: ret = uart_get_rs485_config(uport, uarg); break; case TIOCSRS485: ret = uart_set_rs485_config(tty, uport, uarg); break; case TIOCSISO7816: ret = uart_set_iso7816_config(state->uart_port, uarg); break; case TIOCGISO7816: ret = uart_get_iso7816_config(state->uart_port, uarg); break; default: if (uport->ops->ioctl) ret = uport->ops->ioctl(uport, cmd, arg); break; } out_up: mutex_unlock(&port->mutex); if (cmd == TIOCSRS485) up_write(&tty->termios_rwsem); out: return ret; } static void uart_set_ldisc(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; struct uart_port *uport; struct tty_port *port = &state->port; if (!tty_port_initialized(port)) return; mutex_lock(&state->port.mutex); uport = uart_port_check(state); if (uport && uport->ops->set_ldisc) uport->ops->set_ldisc(uport, &tty->termios); mutex_unlock(&state->port.mutex); } static void uart_set_termios(struct tty_struct *tty, const struct ktermios *old_termios) { struct uart_state *state = tty->driver_data; struct uart_port *uport; unsigned int cflag = tty->termios.c_cflag; unsigned int iflag_mask = IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK; bool sw_changed = false; guard(mutex)(&state->port.mutex); uport = uart_port_check(state); if (!uport) return; /* * Drivers doing software flow control also need to know * about changes to these input settings. */ if (uport->flags & UPF_SOFT_FLOW) { iflag_mask |= IXANY|IXON|IXOFF; sw_changed = tty->termios.c_cc[VSTART] != old_termios->c_cc[VSTART] || tty->termios.c_cc[VSTOP] != old_termios->c_cc[VSTOP]; } /* * These are the bits that are used to setup various * flags in the low level driver. We can ignore the Bfoo * bits in c_cflag; c_[io]speed will always be set * appropriately by set_termios() in tty_ioctl.c */ if ((cflag ^ old_termios->c_cflag) == 0 && tty->termios.c_ospeed == old_termios->c_ospeed && tty->termios.c_ispeed == old_termios->c_ispeed && ((tty->termios.c_iflag ^ old_termios->c_iflag) & iflag_mask) == 0 && !sw_changed) return; uart_change_line_settings(tty, state, old_termios); /* reload cflag from termios; port driver may have overridden flags */ cflag = tty->termios.c_cflag; /* Handle transition to B0 status */ if (((old_termios->c_cflag & CBAUD) != B0) && ((cflag & CBAUD) == B0)) uart_clear_mctrl(uport, TIOCM_RTS | TIOCM_DTR); /* Handle transition away from B0 status */ else if (((old_termios->c_cflag & CBAUD) == B0) && ((cflag & CBAUD) != B0)) { unsigned int mask = TIOCM_DTR; if (!(cflag & CRTSCTS) || !tty_throttled(tty)) mask |= TIOCM_RTS; uart_set_mctrl(uport, mask); } } /* * Calls to uart_close() are serialised via the tty_lock in * drivers/tty/tty_io.c:tty_release() * drivers/tty/tty_io.c:do_tty_hangup() */ static void uart_close(struct tty_struct *tty, struct file *filp) { struct uart_state *state = tty->driver_data; if (!state) { struct uart_driver *drv = tty->driver->driver_state; struct tty_port *port; state = drv->state + tty->index; port = &state->port; spin_lock_irq(&port->lock); --port->count; spin_unlock_irq(&port->lock); return; } pr_debug("uart_close(%d) called\n", tty->index); tty_port_close(tty->port, tty, filp); } static void uart_tty_port_shutdown(struct tty_port *port) { struct uart_state *state = container_of(port, struct uart_state, port); struct uart_port *uport = uart_port_check(state); /* * At this point, we stop accepting input. To do this, we * disable the receive line status interrupts. */ if (WARN(!uport, "detached port still initialized!\n")) return; uart_port_lock_irq(uport); uport->ops->stop_rx(uport); uart_port_unlock_irq(uport); serial_base_port_shutdown(uport); uart_port_shutdown(port); /* * It's possible for shutdown to be called after suspend if we get * a DCD drop (hangup) at just the right time. Clear suspended bit so * we don't try to resume a port that has been shutdown. */ tty_port_set_suspended(port, false); uart_free_xmit_buf(port); uart_change_pm(state, UART_PM_STATE_OFF); } static void uart_wait_until_sent(struct tty_struct *tty, int timeout) { struct uart_state *state = tty->driver_data; struct uart_port *port; unsigned long char_time, expire, fifo_timeout; port = uart_port_ref(state); if (!port) return; if (port->type == PORT_UNKNOWN || port->fifosize == 0) { uart_port_deref(port); return; } /* * Set the check interval to be 1/5 of the estimated time to * send a single character, and make it at least 1. The check * interval should also be less than the timeout. * * Note: we have to use pretty tight timings here to satisfy * the NIST-PCTS. */ char_time = max(nsecs_to_jiffies(port->frame_time / 5), 1UL); if (timeout && timeout < char_time) char_time = timeout; if (!uart_cts_enabled(port)) { /* * If the transmitter hasn't cleared in twice the approximate * amount of time to send the entire FIFO, it probably won't * ever clear. This assumes the UART isn't doing flow * control, which is currently the case. Hence, if it ever * takes longer than FIFO timeout, this is probably due to a * UART bug of some kind. So, we clamp the timeout parameter at * 2 * FIFO timeout. */ fifo_timeout = uart_fifo_timeout(port); if (timeout == 0 || timeout > 2 * fifo_timeout) timeout = 2 * fifo_timeout; } expire = jiffies + timeout; pr_debug("uart_wait_until_sent(%d), jiffies=%lu, expire=%lu...\n", port->line, jiffies, expire); /* * Check whether the transmitter is empty every 'char_time'. * 'timeout' / 'expire' give us the maximum amount of time * we wait. */ while (!port->ops->tx_empty(port)) { msleep_interruptible(jiffies_to_msecs(char_time)); if (signal_pending(current)) break; if (timeout && time_after(jiffies, expire)) break; } uart_port_deref(port); } /* * Calls to uart_hangup() are serialised by the tty_lock in * drivers/tty/tty_io.c:do_tty_hangup() * This runs from a workqueue and can sleep for a _short_ time only. */ static void uart_hangup(struct tty_struct *tty) { struct uart_state *state = tty->driver_data; struct tty_port *port = &state->port; struct uart_port *uport; unsigned long flags; pr_debug("uart_hangup(%d)\n", tty->index); mutex_lock(&port->mutex); uport = uart_port_check(state); WARN(!uport, "hangup of detached port!\n"); if (tty_port_active(port)) { uart_flush_buffer(tty); uart_shutdown(tty, state); spin_lock_irqsave(&port->lock, flags); port->count = 0; spin_unlock_irqrestore(&port->lock, flags); tty_port_set_active(port, false); tty_port_tty_set(port, NULL); if (uport && !uart_console(uport)) uart_change_pm(state, UART_PM_STATE_OFF); wake_up_interruptible(&port->open_wait); wake_up_interruptible(&port->delta_msr_wait); } mutex_unlock(&port->mutex); } /* uport == NULL if uart_port has already been removed */ static void uart_port_shutdown(struct tty_port *port) { struct uart_state *state = container_of(port, struct uart_state, port); struct uart_port *uport = uart_port_check(state); /* * clear delta_msr_wait queue to avoid mem leaks: we may free * the irq here so the queue might never be woken up. Note * that we won't end up waiting on delta_msr_wait again since * any outstanding file descriptors should be pointing at * hung_up_tty_fops now. */ wake_up_interruptible(&port->delta_msr_wait); if (uport) { /* Free the IRQ and disable the port. */ uport->ops->shutdown(uport); /* Ensure that the IRQ handler isn't running on another CPU. */ synchronize_irq(uport->irq); } } static bool uart_carrier_raised(struct tty_port *port) { struct uart_state *state = container_of(port, struct uart_state, port); struct uart_port *uport; int mctrl; uport = uart_port_ref(state); /* * Should never observe uport == NULL since checks for hangup should * abort the tty_port_block_til_ready() loop before checking for carrier * raised -- but report carrier raised if it does anyway so open will * continue and not sleep */ if (WARN_ON(!uport)) return true; uart_port_lock_irq(uport); uart_enable_ms(uport); mctrl = uport->ops->get_mctrl(uport); uart_port_unlock_irq(uport); uart_port_deref(uport); return mctrl & TIOCM_CAR; } static void uart_dtr_rts(struct tty_port *port, bool active) { struct uart_state *state = container_of(port, struct uart_state, port); struct uart_port *uport; uport = uart_port_ref(state); if (!uport) return; uart_port_dtr_rts(uport, active); uart_port_deref(uport); } static int uart_install(struct tty_driver *driver, struct tty_struct *tty) { struct uart_driver *drv = driver->driver_state; struct uart_state *state = drv->state + tty->index; tty->driver_data = state; return tty_standard_install(driver, tty); } /* * Calls to uart_open are serialised by the tty_lock in * drivers/tty/tty_io.c:tty_open() * Note that if this fails, then uart_close() _will_ be called. * * In time, we want to scrap the "opening nonpresent ports" * behaviour and implement an alternative way for setserial * to set base addresses/ports/types. This will allow us to * get rid of a certain amount of extra tests. */ static int uart_open(struct tty_struct *tty, struct file *filp) { struct uart_state *state = tty->driver_data; int retval; retval = tty_port_open(&state->port, tty, filp); if (retval > 0) retval = 0; return retval; } static int uart_port_activate(struct tty_port *port, struct tty_struct *tty) { struct uart_state *state = container_of(port, struct uart_state, port); struct uart_port *uport; int ret; uport = uart_port_check(state); if (!uport || uport->flags & UPF_DEAD) return -ENXIO; /* * Start up the serial port. */ ret = uart_startup(tty, state, false); if (ret > 0) tty_port_set_active(port, true); return ret; } static const char *uart_type(struct uart_port *port) { const char *str = NULL; if (port->ops->type) str = port->ops->type(port); if (!str) str = "unknown"; return str; } #ifdef CONFIG_PROC_FS static void uart_line_info(struct seq_file *m, struct uart_driver *drv, int i) { struct uart_state *state = drv->state + i; struct tty_port *port = &state->port; enum uart_pm_state pm_state; struct uart_port *uport; char stat_buf[32]; unsigned int status; int mmio; guard(mutex)(&port->mutex); uport = uart_port_check(state); if (!uport) return; mmio = uport->iotype >= UPIO_MEM; seq_printf(m, "%d: uart:%s %s%08llX irq:%d", uport->line, uart_type(uport), mmio ? "mmio:0x" : "port:", mmio ? (unsigned long long)uport->mapbase : (unsigned long long)uport->iobase, uport->irq); if (uport->type == PORT_UNKNOWN) { seq_putc(m, '\n'); return; } if (capable(CAP_SYS_ADMIN)) { pm_state = state->pm_state; if (pm_state != UART_PM_STATE_ON) uart_change_pm(state, UART_PM_STATE_ON); uart_port_lock_irq(uport); status = uport->ops->get_mctrl(uport); uart_port_unlock_irq(uport); if (pm_state != UART_PM_STATE_ON) uart_change_pm(state, pm_state); seq_printf(m, " tx:%d rx:%d", uport->icount.tx, uport->icount.rx); if (uport->icount.frame) seq_printf(m, " fe:%d", uport->icount.frame); if (uport->icount.parity) seq_printf(m, " pe:%d", uport->icount.parity); if (uport->icount.brk) seq_printf(m, " brk:%d", uport->icount.brk); if (uport->icount.overrun) seq_printf(m, " oe:%d", uport->icount.overrun); if (uport->icount.buf_overrun) seq_printf(m, " bo:%d", uport->icount.buf_overrun); #define INFOBIT(bit, str) \ if (uport->mctrl & (bit)) \ strncat(stat_buf, (str), sizeof(stat_buf) - \ strlen(stat_buf) - 2) #define STATBIT(bit, str) \ if (status & (bit)) \ strncat(stat_buf, (str), sizeof(stat_buf) - \ strlen(stat_buf) - 2) stat_buf[0] = '\0'; stat_buf[1] = '\0'; INFOBIT(TIOCM_RTS, "|RTS"); STATBIT(TIOCM_CTS, "|CTS"); INFOBIT(TIOCM_DTR, "|DTR"); STATBIT(TIOCM_DSR, "|DSR"); STATBIT(TIOCM_CAR, "|CD"); STATBIT(TIOCM_RNG, "|RI"); if (stat_buf[0]) stat_buf[0] = ' '; seq_puts(m, stat_buf); } seq_putc(m, '\n'); #undef STATBIT #undef INFOBIT } static int uart_proc_show(struct seq_file *m, void *v) { struct tty_driver *ttydrv = m->private; struct uart_driver *drv = ttydrv->driver_state; int i; seq_printf(m, "serinfo:1.0 driver%s%s revision:%s\n", "", "", ""); for (i = 0; i < drv->nr; i++) uart_line_info(m, drv, i); return 0; } #endif static void uart_port_spin_lock_init(struct uart_port *port) { spin_lock_init(&port->lock); lockdep_set_class(&port->lock, &port_lock_key); } #if defined(CONFIG_SERIAL_CORE_CONSOLE) || defined(CONFIG_CONSOLE_POLL) /** * uart_console_write - write a console message to a serial port * @port: the port to write the message * @s: array of characters * @count: number of characters in string to write * @putchar: function to write character to port */ void uart_console_write(struct uart_port *port, const char *s, unsigned int count, void (*putchar)(struct uart_port *, unsigned char)) { unsigned int i; for (i = 0; i < count; i++, s++) { if (*s == '\n') putchar(port, '\r'); putchar(port, *s); } } EXPORT_SYMBOL_GPL(uart_console_write); /** * uart_get_console - get uart port for console * @ports: ports to search in * @nr: number of @ports * @co: console to search for * Returns: uart_port for the console @co * * Check whether an invalid uart number has been specified (as @co->index), and * if so, search for the first available port that does have console support. */ struct uart_port * __init uart_get_console(struct uart_port *ports, int nr, struct console *co) { int idx = co->index; if (idx < 0 || idx >= nr || (ports[idx].iobase == 0 && ports[idx].membase == NULL)) for (idx = 0; idx < nr; idx++) if (ports[idx].iobase != 0 || ports[idx].membase != NULL) break; co->index = idx; return ports + idx; } /** * uart_parse_earlycon - Parse earlycon options * @p: ptr to 2nd field (ie., just beyond '<name>,') * @iotype: ptr for decoded iotype (out) * @addr: ptr for decoded mapbase/iobase (out) * @options: ptr for <options> field; %NULL if not present (out) * * Decodes earlycon kernel command line parameters of the form: * * earlycon=<name>,io|mmio|mmio16|mmio32|mmio32be|mmio32native,<addr>,<options> * * console=<name>,io|mmio|mmio16|mmio32|mmio32be|mmio32native,<addr>,<options> * * The optional form: * * earlycon=<name>,0x<addr>,<options> * * console=<name>,0x<addr>,<options> * * is also accepted; the returned @iotype will be %UPIO_MEM. * * Returns: 0 on success or -%EINVAL on failure */ int uart_parse_earlycon(char *p, unsigned char *iotype, resource_size_t *addr, char **options) { if (strncmp(p, "mmio,", 5) == 0) { *iotype = UPIO_MEM; p += 5; } else if (strncmp(p, "mmio16,", 7) == 0) { *iotype = UPIO_MEM16; p += 7; } else if (strncmp(p, "mmio32,", 7) == 0) { *iotype = UPIO_MEM32; p += 7; } else if (strncmp(p, "mmio32be,", 9) == 0) { *iotype = UPIO_MEM32BE; p += 9; } else if (strncmp(p, "mmio32native,", 13) == 0) { *iotype = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) ? UPIO_MEM32BE : UPIO_MEM32; p += 13; } else if (strncmp(p, "io,", 3) == 0) { *iotype = UPIO_PORT; p += 3; } else if (strncmp(p, "0x", 2) == 0) { *iotype = UPIO_MEM; } else { return -EINVAL; } /* * Before you replace it with kstrtoull(), think about options separator * (',') it will not tolerate */ *addr = simple_strtoull(p, NULL, 0); p = strchr(p, ','); if (p) p++; *options = p; return 0; } EXPORT_SYMBOL_GPL(uart_parse_earlycon); /** * uart_parse_options - Parse serial port baud/parity/bits/flow control. * @options: pointer to option string * @baud: pointer to an 'int' variable for the baud rate. * @parity: pointer to an 'int' variable for the parity. * @bits: pointer to an 'int' variable for the number of data bits. * @flow: pointer to an 'int' variable for the flow control character. * * uart_parse_options() decodes a string containing the serial console * options. The format of the string is <baud><parity><bits><flow>, * eg: 115200n8r */ void uart_parse_options(const char *options, int *baud, int *parity, int *bits, int *flow) { const char *s = options; *baud = simple_strtoul(s, NULL, 10); while (*s >= '0' && *s <= '9') s++; if (*s) *parity = *s++; if (*s) *bits = *s++ - '0'; if (*s) *flow = *s; } EXPORT_SYMBOL_GPL(uart_parse_options); /** * uart_set_options - setup the serial console parameters * @port: pointer to the serial ports uart_port structure * @co: console pointer * @baud: baud rate * @parity: parity character - 'n' (none), 'o' (odd), 'e' (even) * @bits: number of data bits * @flow: flow control character - 'r' (rts) * * Locking: Caller must hold console_list_lock in order to serialize * early initialization of the serial-console lock. */ int uart_set_options(struct uart_port *port, struct console *co, int baud, int parity, int bits, int flow) { struct ktermios termios; static struct ktermios dummy; /* * Ensure that the serial-console lock is initialised early. * * Note that the console-registered check is needed because * kgdboc can call uart_set_options() for an already registered * console via tty_find_polling_driver() and uart_poll_init(). */ if (!uart_console_registered_locked(port) && !port->console_reinit) uart_port_spin_lock_init(port); memset(&termios, 0, sizeof(struct ktermios)); termios.c_cflag |= CREAD | HUPCL | CLOCAL; tty_termios_encode_baud_rate(&termios, baud, baud); if (bits == 7) termios.c_cflag |= CS7; else termios.c_cflag |= CS8; switch (parity) { case 'o': case 'O': termios.c_cflag |= PARODD; fallthrough; case 'e': case 'E': termios.c_cflag |= PARENB; break; } if (flow == 'r') termios.c_cflag |= CRTSCTS; /* * some uarts on other side don't support no flow control. * So we set * DTR in host uart to make them happy */ port->mctrl |= TIOCM_DTR; port->ops->set_termios(port, &termios, &dummy); /* * Allow the setting of the UART parameters with a NULL console * too: */ if (co) { co->cflag = termios.c_cflag; co->ispeed = termios.c_ispeed; co->ospeed = termios.c_ospeed; } return 0; } EXPORT_SYMBOL_GPL(uart_set_options); #endif /* CONFIG_SERIAL_CORE_CONSOLE */ /** * uart_change_pm - set power state of the port * * @state: port descriptor * @pm_state: new state * * Locking: port->mutex has to be held */ static void uart_change_pm(struct uart_state *state, enum uart_pm_state pm_state) { struct uart_port *port = uart_port_check(state); if (state->pm_state != pm_state) { if (port && port->ops->pm) port->ops->pm(port, pm_state, state->pm_state); state->pm_state = pm_state; } } struct uart_match { struct uart_port *port; struct uart_driver *driver; }; static int serial_match_port(struct device *dev, void *data) { struct uart_match *match = data; struct tty_driver *tty_drv = match->driver->tty_driver; dev_t devt = MKDEV(tty_drv->major, tty_drv->minor_start) + match->port->line; return dev->devt == devt; /* Actually, only one tty per port */ } int uart_suspend_port(struct uart_driver *drv, struct uart_port *uport) { struct uart_state *state = drv->state + uport->line; struct tty_port *port = &state->port; struct device *tty_dev; struct uart_match match = {uport, drv}; guard(mutex)(&port->mutex); tty_dev = device_find_child(&uport->port_dev->dev, &match, serial_match_port); if (tty_dev && device_may_wakeup(tty_dev)) { enable_irq_wake(uport->irq); put_device(tty_dev); return 0; } put_device(tty_dev); /* * Nothing to do if the console is not suspending * except stop_rx to prevent any asynchronous data * over RX line. However ensure that we will be * able to Re-start_rx later. */ if (!console_suspend_enabled && uart_console(uport)) { if (uport->ops->start_rx) { uart_port_lock_irq(uport); uport->ops->stop_rx(uport); uart_port_unlock_irq(uport); } device_set_awake_path(uport->dev); return 0; } uport->suspended = 1; if (tty_port_initialized(port)) { const struct uart_ops *ops = uport->ops; int tries; unsigned int mctrl; tty_port_set_suspended(port, true); tty_port_set_initialized(port, false); uart_port_lock_irq(uport); ops->stop_tx(uport); if (!(uport->rs485.flags & SER_RS485_ENABLED)) ops->set_mctrl(uport, 0); /* save mctrl so it can be restored on resume */ mctrl = uport->mctrl; uport->mctrl = 0; ops->stop_rx(uport); uart_port_unlock_irq(uport); /* * Wait for the transmitter to empty. */ for (tries = 3; !ops->tx_empty(uport) && tries; tries--) msleep(10); if (!tries) dev_err(uport->dev, "%s: Unable to drain transmitter\n", uport->name); ops->shutdown(uport); uport->mctrl = mctrl; } /* * Disable the console device before suspending. */ if (uart_console(uport)) console_stop(uport->cons); uart_change_pm(state, UART_PM_STATE_OFF); return 0; } EXPORT_SYMBOL(uart_suspend_port); int uart_resume_port(struct uart_driver *drv, struct uart_port *uport) { struct uart_state *state = drv->state + uport->line; struct tty_port *port = &state->port; struct device *tty_dev; struct uart_match match = {uport, drv}; struct ktermios termios; guard(mutex)(&port->mutex); tty_dev = device_find_child(&uport->port_dev->dev, &match, serial_match_port); if (!uport->suspended && device_may_wakeup(tty_dev)) { if (irqd_is_wakeup_set(irq_get_irq_data((uport->irq)))) disable_irq_wake(uport->irq); put_device(tty_dev); return 0; } put_device(tty_dev); uport->suspended = 0; /* * Re-enable the console device after suspending. */ if (uart_console(uport)) { /* * First try to use the console cflag setting. */ memset(&termios, 0, sizeof(struct ktermios)); termios.c_cflag = uport->cons->cflag; termios.c_ispeed = uport->cons->ispeed; termios.c_ospeed = uport->cons->ospeed; /* * If that's unset, use the tty termios setting. */ if (port->tty && termios.c_cflag == 0) termios = port->tty->termios; if (console_suspend_enabled) uart_change_pm(state, UART_PM_STATE_ON); uport->ops->set_termios(uport, &termios, NULL); if (!console_suspend_enabled && uport->ops->start_rx) { uart_port_lock_irq(uport); uport->ops->start_rx(uport); uart_port_unlock_irq(uport); } if (console_suspend_enabled) console_start(uport->cons); } if (tty_port_suspended(port)) { const struct uart_ops *ops = uport->ops; int ret; uart_change_pm(state, UART_PM_STATE_ON); uart_port_lock_irq(uport); if (!(uport->rs485.flags & SER_RS485_ENABLED)) ops->set_mctrl(uport, 0); uart_port_unlock_irq(uport); if (console_suspend_enabled || !uart_console(uport)) { /* Protected by port mutex for now */ struct tty_struct *tty = port->tty; ret = ops->startup(uport); if (ret == 0) { if (tty) uart_change_line_settings(tty, state, NULL); uart_rs485_config(uport); uart_port_lock_irq(uport); if (!(uport->rs485.flags & SER_RS485_ENABLED)) ops->set_mctrl(uport, uport->mctrl); ops->start_tx(uport); uart_port_unlock_irq(uport); tty_port_set_initialized(port, true); } else { /* * Failed to resume - maybe hardware went away? * Clear the "initialized" flag so we won't try * to call the low level drivers shutdown method. */ uart_shutdown(tty, state); } } tty_port_set_suspended(port, false); } return 0; } EXPORT_SYMBOL(uart_resume_port); static inline void uart_report_port(struct uart_driver *drv, struct uart_port *port) { char address[64]; switch (port->iotype) { case UPIO_PORT: snprintf(address, sizeof(address), "I/O 0x%lx", port->iobase); break; case UPIO_HUB6: snprintf(address, sizeof(address), "I/O 0x%lx offset 0x%x", port->iobase, port->hub6); break; case UPIO_MEM: case UPIO_MEM16: case UPIO_MEM32: case UPIO_MEM32BE: case UPIO_AU: case UPIO_TSI: snprintf(address, sizeof(address), "MMIO 0x%llx", (unsigned long long)port->mapbase); break; default: strscpy(address, "*unknown*", sizeof(address)); break; } pr_info("%s%s%s at %s (irq = %d, base_baud = %d) is a %s\n", port->dev ? dev_name(port->dev) : "", port->dev ? ": " : "", port->name, address, port->irq, port->uartclk / 16, uart_type(port)); /* The magic multiplier feature is a bit obscure, so report it too. */ if (port->flags & UPF_MAGIC_MULTIPLIER) pr_info("%s%s%s extra baud rates supported: %d, %d", port->dev ? dev_name(port->dev) : "", port->dev ? ": " : "", port->name, port->uartclk / 8, port->uartclk / 4); } static void uart_configure_port(struct uart_driver *drv, struct uart_state *state, struct uart_port *port) { unsigned int flags; /* * If there isn't a port here, don't do anything further. */ if (!port->iobase && !port->mapbase && !port->membase) return; /* * Now do the auto configuration stuff. Note that config_port * is expected to claim the resources and map the port for us. */ flags = 0; if (port->flags & UPF_AUTO_IRQ) flags |= UART_CONFIG_IRQ; if (port->flags & UPF_BOOT_AUTOCONF) { if (!(port->flags & UPF_FIXED_TYPE)) { port->type = PORT_UNKNOWN; flags |= UART_CONFIG_TYPE; } /* Synchronize with possible boot console. */ if (uart_console(port)) console_lock(); port->ops->config_port(port, flags); if (uart_console(port)) console_unlock(); } if (port->type != PORT_UNKNOWN) { unsigned long flags; uart_report_port(drv, port); /* Synchronize with possible boot console. */ if (uart_console(port)) console_lock(); /* Power up port for set_mctrl() */ uart_change_pm(state, UART_PM_STATE_ON); /* * Ensure that the modem control lines are de-activated. * keep the DTR setting that is set in uart_set_options() * We probably don't need a spinlock around this, but */ uart_port_lock_irqsave(port, &flags); port->mctrl &= TIOCM_DTR; if (!(port->rs485.flags & SER_RS485_ENABLED)) port->ops->set_mctrl(port, port->mctrl); uart_port_unlock_irqrestore(port, flags); uart_rs485_config(port); if (uart_console(port)) console_unlock(); /* * If this driver supports console, and it hasn't been * successfully registered yet, try to re-register it. * It may be that the port was not available. */ if (port->cons && !console_is_registered(port->cons)) register_console(port->cons); /* * Power down all ports by default, except the * console if we have one. */ if (!uart_console(port)) uart_change_pm(state, UART_PM_STATE_OFF); } } #ifdef CONFIG_CONSOLE_POLL static int uart_poll_init(struct tty_driver *driver, int line, char *options) { struct uart_driver *drv = driver->driver_state; struct uart_state *state = drv->state + line; enum uart_pm_state pm_state; struct tty_port *tport; struct uart_port *port; int baud = 9600; int bits = 8; int parity = 'n'; int flow = 'n'; int ret = 0; tport = &state->port; guard(mutex)(&tport->mutex); port = uart_port_check(state); if (!port || port->type == PORT_UNKNOWN || !(port->ops->poll_get_char && port->ops->poll_put_char)) return -1; pm_state = state->pm_state; uart_change_pm(state, UART_PM_STATE_ON); if (port->ops->poll_init) { /* * We don't set initialized as we only initialized the hw, * e.g. state->xmit is still uninitialized. */ if (!tty_port_initialized(tport)) ret = port->ops->poll_init(port); } if (!ret && options) { uart_parse_options(options, &baud, &parity, &bits, &flow); console_list_lock(); ret = uart_set_options(port, NULL, baud, parity, bits, flow); console_list_unlock(); } if (ret) uart_change_pm(state, pm_state); return ret; } static int uart_poll_get_char(struct tty_driver *driver, int line) { struct uart_driver *drv = driver->driver_state; struct uart_state *state = drv->state + line; struct uart_port *port; int ret = -1; port = uart_port_ref(state); if (port) { ret = port->ops->poll_get_char(port); uart_port_deref(port); } return ret; } static void uart_poll_put_char(struct tty_driver *driver, int line, char ch) { struct uart_driver *drv = driver->driver_state; struct uart_state *state = drv->state + line; struct uart_port *port; port = uart_port_ref(state); if (!port) return; if (ch == '\n') port->ops->poll_put_char(port, '\r'); port->ops->poll_put_char(port, ch); uart_port_deref(port); } #endif static const struct tty_operations uart_ops = { .install = uart_install, .open = uart_open, .close = uart_close, .write = uart_write, .put_char = uart_put_char, .flush_chars = uart_flush_chars, .write_room = uart_write_room, .chars_in_buffer= uart_chars_in_buffer, .flush_buffer = uart_flush_buffer, .ioctl = uart_ioctl, .throttle = uart_throttle, .unthrottle = uart_unthrottle, .send_xchar = uart_send_xchar, .set_termios = uart_set_termios, .set_ldisc = uart_set_ldisc, .stop = uart_stop, .start = uart_start, .hangup = uart_hangup, .break_ctl = uart_break_ctl, .wait_until_sent= uart_wait_until_sent, #ifdef CONFIG_PROC_FS .proc_show = uart_proc_show, #endif .tiocmget = uart_tiocmget, .tiocmset = uart_tiocmset, .set_serial = uart_set_info_user, .get_serial = uart_get_info_user, .get_icount = uart_get_icount, #ifdef CONFIG_CONSOLE_POLL .poll_init = uart_poll_init, .poll_get_char = uart_poll_get_char, .poll_put_char = uart_poll_put_char, #endif }; static const struct tty_port_operations uart_port_ops = { .carrier_raised = uart_carrier_raised, .dtr_rts = uart_dtr_rts, .activate = uart_port_activate, .shutdown = uart_tty_port_shutdown, }; /** * uart_register_driver - register a driver with the uart core layer * @drv: low level driver structure * * Register a uart driver with the core driver. We in turn register with the * tty layer, and initialise the core driver per-port state. * * We have a proc file in /proc/tty/driver which is named after the normal * driver. * * @drv->port should be %NULL, and the per-port structures should be registered * using uart_add_one_port() after this call has succeeded. * * Locking: none, Interrupts: enabled */ int uart_register_driver(struct uart_driver *drv) { struct tty_driver *normal; int i, retval = -ENOMEM; BUG_ON(drv->state); /* * Maybe we should be using a slab cache for this, especially if * we have a large number of ports to handle. */ drv->state = kcalloc(drv->nr, sizeof(struct uart_state), GFP_KERNEL); if (!drv->state) goto out; normal = tty_alloc_driver(drv->nr, TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV); if (IS_ERR(normal)) { retval = PTR_ERR(normal); goto out_kfree; } drv->tty_driver = normal; normal->driver_name = drv->driver_name; normal->name = drv->dev_name; normal->major = drv->major; normal->minor_start = drv->minor; normal->type = TTY_DRIVER_TYPE_SERIAL; normal->subtype = SERIAL_TYPE_NORMAL; normal->init_termios = tty_std_termios; normal->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL; normal->init_termios.c_ispeed = normal->init_termios.c_ospeed = 9600; normal->driver_state = drv; tty_set_operations(normal, &uart_ops); /* * Initialise the UART state(s). */ for (i = 0; i < drv->nr; i++) { struct uart_state *state = drv->state + i; struct tty_port *port = &state->port; tty_port_init(port); port->ops = &uart_port_ops; } retval = tty_register_driver(normal); if (retval >= 0) return retval; for (i = 0; i < drv->nr; i++) tty_port_destroy(&drv->state[i].port); tty_driver_kref_put(normal); out_kfree: kfree(drv->state); out: return retval; } EXPORT_SYMBOL(uart_register_driver); /** * uart_unregister_driver - remove a driver from the uart core layer * @drv: low level driver structure * * Remove all references to a driver from the core driver. The low level * driver must have removed all its ports via the uart_remove_one_port() if it * registered them with uart_add_one_port(). (I.e. @drv->port is %NULL.) * * Locking: none, Interrupts: enabled */ void uart_unregister_driver(struct uart_driver *drv) { struct tty_driver *p = drv->tty_driver; unsigned int i; tty_unregister_driver(p); tty_driver_kref_put(p); for (i = 0; i < drv->nr; i++) tty_port_destroy(&drv->state[i].port); kfree(drv->state); drv->state = NULL; drv->tty_driver = NULL; } EXPORT_SYMBOL(uart_unregister_driver); struct tty_driver *uart_console_device(struct console *co, int *index) { struct uart_driver *p = co->data; *index = co->index; return p->tty_driver; } EXPORT_SYMBOL_GPL(uart_console_device); static ssize_t uartclk_show(struct device *dev, struct device_attribute *attr, char *buf) { struct serial_struct tmp; struct tty_port *port = dev_get_drvdata(dev); uart_get_info(port, &tmp); return sprintf(buf, "%d\n", tmp.baud_base * 16); } static ssize_t type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct serial_struct tmp; struct tty_port *port = dev_get_drvdata(dev); uart_get_info(port, &tmp); return sprintf(buf, "%d\n", tmp.type); } static ssize_t line_show(struct device *dev, struct device_attribute *attr, char *buf) { struct serial_struct tmp; struct tty_port *port = dev_get_drvdata(dev); uart_get_info(port, &tmp); return sprintf(buf, "%d\n", tmp.line); } static ssize_t port_show(struct device *dev, struct device_attribute *attr, char *buf) { struct serial_struct tmp; struct tty_port *port = dev_get_drvdata(dev); unsigned long ioaddr; uart_get_info(port, &tmp); ioaddr = tmp.port; if (HIGH_BITS_OFFSET) ioaddr |= (unsigned long)tmp.port_high << HIGH_BITS_OFFSET; return sprintf(buf, "0x%lX\n", ioaddr); } static ssize_t irq_show(struct device *dev, struct device_attribute *attr, char *buf) { struct serial_struct tmp; struct tty_port *port = dev_get_drvdata(dev); uart_get_info(port, &tmp); return sprintf(buf, "%d\n", tmp.irq); } static ssize_t flags_show(struct device *dev, struct device_attribute *attr, char *buf) { struct serial_struct tmp; struct tty_port *port = dev_get_drvdata(dev); uart_get_info(port, &tmp); return sprintf(buf, "0x%X\n", tmp.flags); } static ssize_t xmit_fifo_size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct serial_struct tmp; struct tty_port *port = dev_get_drvdata(dev); uart_get_info(port, &tmp); return sprintf(buf, "%d\n", tmp.xmit_fifo_size); } static ssize_t close_delay_show(struct device *dev, struct device_attribute *attr, char *buf) { struct serial_struct tmp; struct tty_port *port = dev_get_drvdata(dev); uart_get_info(port, &tmp); return sprintf(buf, "%d\n", tmp.close_delay); } static ssize_t closing_wait_show(struct device *dev, struct device_attribute *attr, char *buf) { struct serial_struct tmp; struct tty_port *port = dev_get_drvdata(dev); uart_get_info(port, &tmp); return sprintf(buf, "%d\n", tmp.closing_wait); } static ssize_t custom_divisor_show(struct device *dev, struct device_attribute *attr, char *buf) { struct serial_struct tmp; struct tty_port *port = dev_get_drvdata(dev); uart_get_info(port, &tmp); return sprintf(buf, "%d\n", tmp.custom_divisor); } static ssize_t io_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct serial_struct tmp; struct tty_port *port = dev_get_drvdata(dev); uart_get_info(port, &tmp); return sprintf(buf, "%d\n", tmp.io_type); } static ssize_t iomem_base_show(struct device *dev, struct device_attribute *attr, char *buf) { struct serial_struct tmp; struct tty_port *port = dev_get_drvdata(dev); uart_get_info(port, &tmp); return sprintf(buf, "0x%lX\n", (unsigned long)tmp.iomem_base); } static ssize_t iomem_reg_shift_show(struct device *dev, struct device_attribute *attr, char *buf) { struct serial_struct tmp; struct tty_port *port = dev_get_drvdata(dev); uart_get_info(port, &tmp); return sprintf(buf, "%d\n", tmp.iomem_reg_shift); } static ssize_t console_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tty_port *port = dev_get_drvdata(dev); struct uart_state *state = container_of(port, struct uart_state, port); struct uart_port *uport; bool console = false; mutex_lock(&port->mutex); uport = uart_port_check(state); if (uport) console = uart_console_registered(uport); mutex_unlock(&port->mutex); return sprintf(buf, "%c\n", console ? 'Y' : 'N'); } static ssize_t console_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct tty_port *port = dev_get_drvdata(dev); struct uart_state *state = container_of(port, struct uart_state, port); struct uart_port *uport; bool oldconsole, newconsole; int ret; ret = kstrtobool(buf, &newconsole); if (ret) return ret; mutex_lock(&port->mutex); uport = uart_port_check(state); if (uport) { oldconsole = uart_console_registered(uport); if (oldconsole && !newconsole) { ret = unregister_console(uport->cons); } else if (!oldconsole && newconsole) { if (uart_console(uport)) { uport->console_reinit = 1; register_console(uport->cons); } else { ret = -ENOENT; } } } else { ret = -ENXIO; } mutex_unlock(&port->mutex); return ret < 0 ? ret : count; } static DEVICE_ATTR_RO(uartclk); static DEVICE_ATTR_RO(type); static DEVICE_ATTR_RO(line); static DEVICE_ATTR_RO(port); static DEVICE_ATTR_RO(irq); static DEVICE_ATTR_RO(flags); static DEVICE_ATTR_RO(xmit_fifo_size); static DEVICE_ATTR_RO(close_delay); static DEVICE_ATTR_RO(closing_wait); static DEVICE_ATTR_RO(custom_divisor); static DEVICE_ATTR_RO(io_type); static DEVICE_ATTR_RO(iomem_base); static DEVICE_ATTR_RO(iomem_reg_shift); static DEVICE_ATTR_RW(console); static struct attribute *tty_dev_attrs[] = { &dev_attr_uartclk.attr, &dev_attr_type.attr, &dev_attr_line.attr, &dev_attr_port.attr, &dev_attr_irq.attr, &dev_attr_flags.attr, &dev_attr_xmit_fifo_size.attr, &dev_attr_close_delay.attr, &dev_attr_closing_wait.attr, &dev_attr_custom_divisor.attr, &dev_attr_io_type.attr, &dev_attr_iomem_base.attr, &dev_attr_iomem_reg_shift.attr, &dev_attr_console.attr, NULL }; static const struct attribute_group tty_dev_attr_group = { .attrs = tty_dev_attrs, }; /** * serial_core_add_one_port - attach a driver-defined port structure * @drv: pointer to the uart low level driver structure for this port * @uport: uart port structure to use for this port. * * Context: task context, might sleep * * This allows the driver @drv to register its own uart_port structure with the * core driver. The main purpose is to allow the low level uart drivers to * expand uart_port, rather than having yet more levels of structures. * Caller must hold port_mutex. */ static int serial_core_add_one_port(struct uart_driver *drv, struct uart_port *uport) { struct uart_state *state; struct tty_port *port; int ret = 0; struct device *tty_dev; int num_groups; if (uport->line >= drv->nr) return -EINVAL; state = drv->state + uport->line; port = &state->port; mutex_lock(&port->mutex); if (state->uart_port) { ret = -EINVAL; goto out; } /* Link the port to the driver state table and vice versa */ atomic_set(&state->refcount, 1); init_waitqueue_head(&state->remove_wait); state->uart_port = uport; uport->state = state; /* * If this port is in use as a console then the spinlock is already * initialised. */ if (!uart_console_registered(uport)) uart_port_spin_lock_init(uport); state->pm_state = UART_PM_STATE_UNDEFINED; uart_port_set_cons(uport, drv->cons); uport->minor = drv->tty_driver->minor_start + uport->line; uport->name = kasprintf(GFP_KERNEL, "%s%d", drv->dev_name, drv->tty_driver->name_base + uport->line); if (!uport->name) { ret = -ENOMEM; goto out; } if (uport->cons && uport->dev) of_console_check(uport->dev->of_node, uport->cons->name, uport->line); tty_port_link_device(port, drv->tty_driver, uport->line); uart_configure_port(drv, state, uport); port->console = uart_console(uport); num_groups = 2; if (uport->attr_group) num_groups++; uport->tty_groups = kcalloc(num_groups, sizeof(*uport->tty_groups), GFP_KERNEL); if (!uport->tty_groups) { ret = -ENOMEM; goto out; } uport->tty_groups[0] = &tty_dev_attr_group; if (uport->attr_group) uport->tty_groups[1] = uport->attr_group; /* Ensure serdev drivers can call serdev_device_open() right away */ uport->flags &= ~UPF_DEAD; /* * Register the port whether it's detected or not. This allows * setserial to be used to alter this port's parameters. */ tty_dev = tty_port_register_device_attr_serdev(port, drv->tty_driver, uport->line, uport->dev, &uport->port_dev->dev, port, uport->tty_groups); if (!IS_ERR(tty_dev)) { device_set_wakeup_capable(tty_dev, 1); } else { uport->flags |= UPF_DEAD; dev_err(uport->dev, "Cannot register tty device on line %d\n", uport->line); } out: mutex_unlock(&port->mutex); return ret; } /** * serial_core_remove_one_port - detach a driver defined port structure * @drv: pointer to the uart low level driver structure for this port * @uport: uart port structure for this port * * Context: task context, might sleep * * This unhooks (and hangs up) the specified port structure from the core * driver. No further calls will be made to the low-level code for this port. * Caller must hold port_mutex. */ static void serial_core_remove_one_port(struct uart_driver *drv, struct uart_port *uport) { struct uart_state *state = drv->state + uport->line; struct tty_port *port = &state->port; struct uart_port *uart_port; struct tty_struct *tty; mutex_lock(&port->mutex); uart_port = uart_port_check(state); if (uart_port != uport) dev_alert(uport->dev, "Removing wrong port: %p != %p\n", uart_port, uport); if (!uart_port) { mutex_unlock(&port->mutex); return; } mutex_unlock(&port->mutex); /* * Remove the devices from the tty layer */ tty_port_unregister_device(port, drv->tty_driver, uport->line); tty = tty_port_tty_get(port); if (tty) { tty_vhangup(port->tty); tty_kref_put(tty); } /* * If the port is used as a console, unregister it */ if (uart_console(uport)) unregister_console(uport->cons); /* * Free the port IO and memory resources, if any. */ if (uport->type != PORT_UNKNOWN && uport->ops->release_port) uport->ops->release_port(uport); kfree(uport->tty_groups); kfree(uport->name); /* * Indicate that there isn't a port here anymore. */ uport->type = PORT_UNKNOWN; uport->port_dev = NULL; mutex_lock(&port->mutex); WARN_ON(atomic_dec_return(&state->refcount) < 0); wait_event(state->remove_wait, !atomic_read(&state->refcount)); state->uart_port = NULL; mutex_unlock(&port->mutex); } /** * uart_match_port - are the two ports equivalent? * @port1: first port * @port2: second port * * This utility function can be used to determine whether two uart_port * structures describe the same port. */ bool uart_match_port(const struct uart_port *port1, const struct uart_port *port2) { if (port1->iotype != port2->iotype) return false; switch (port1->iotype) { case UPIO_PORT: return port1->iobase == port2->iobase; case UPIO_HUB6: return port1->iobase == port2->iobase && port1->hub6 == port2->hub6; case UPIO_MEM: case UPIO_MEM16: case UPIO_MEM32: case UPIO_MEM32BE: case UPIO_AU: case UPIO_TSI: return port1->mapbase == port2->mapbase; } return false; } EXPORT_SYMBOL(uart_match_port); static struct serial_ctrl_device * serial_core_get_ctrl_dev(struct serial_port_device *port_dev) { struct device *dev = &port_dev->dev; return to_serial_base_ctrl_device(dev->parent); } /* * Find a registered serial core controller device if one exists. Returns * the first device matching the ctrl_id. Caller must hold port_mutex. */ static struct serial_ctrl_device *serial_core_ctrl_find(struct uart_driver *drv, struct device *phys_dev, int ctrl_id) { struct uart_state *state; int i; lockdep_assert_held(&port_mutex); for (i = 0; i < drv->nr; i++) { state = drv->state + i; if (!state->uart_port || !state->uart_port->port_dev) continue; if (state->uart_port->dev == phys_dev && state->uart_port->ctrl_id == ctrl_id) return serial_core_get_ctrl_dev(state->uart_port->port_dev); } return NULL; } static struct serial_ctrl_device *serial_core_ctrl_device_add(struct uart_port *port) { return serial_base_ctrl_add(port, port->dev); } static int serial_core_port_device_add(struct serial_ctrl_device *ctrl_dev, struct uart_port *port) { struct serial_port_device *port_dev; port_dev = serial_base_port_add(port, ctrl_dev); if (IS_ERR(port_dev)) return PTR_ERR(port_dev); port->port_dev = port_dev; return 0; } /* * Initialize a serial core port device, and a controller device if needed. */ int serial_core_register_port(struct uart_driver *drv, struct uart_port *port) { struct serial_ctrl_device *ctrl_dev, *new_ctrl_dev = NULL; int ret; mutex_lock(&port_mutex); /* * Prevent serial_port_runtime_resume() from trying to use the port * until serial_core_add_one_port() has completed */ port->flags |= UPF_DEAD; /* Inititalize a serial core controller device if needed */ ctrl_dev = serial_core_ctrl_find(drv, port->dev, port->ctrl_id); if (!ctrl_dev) { new_ctrl_dev = serial_core_ctrl_device_add(port); if (IS_ERR(new_ctrl_dev)) { ret = PTR_ERR(new_ctrl_dev); goto err_unlock; } ctrl_dev = new_ctrl_dev; } /* * Initialize a serial core port device. Tag the port dead to prevent * serial_port_runtime_resume() trying to do anything until port has * been registered. It gets cleared by serial_core_add_one_port(). */ ret = serial_core_port_device_add(ctrl_dev, port); if (ret) goto err_unregister_ctrl_dev; ret = serial_base_match_and_update_preferred_console(drv, port); if (ret) goto err_unregister_port_dev; ret = serial_core_add_one_port(drv, port); if (ret) goto err_unregister_port_dev; mutex_unlock(&port_mutex); return 0; err_unregister_port_dev: serial_base_port_device_remove(port->port_dev); err_unregister_ctrl_dev: serial_base_ctrl_device_remove(new_ctrl_dev); err_unlock: mutex_unlock(&port_mutex); return ret; } /* * Removes a serial core port device, and the related serial core controller * device if the last instance. */ void serial_core_unregister_port(struct uart_driver *drv, struct uart_port *port) { struct device *phys_dev = port->dev; struct serial_port_device *port_dev = port->port_dev; struct serial_ctrl_device *ctrl_dev = serial_core_get_ctrl_dev(port_dev); int ctrl_id = port->ctrl_id; mutex_lock(&port_mutex); port->flags |= UPF_DEAD; serial_core_remove_one_port(drv, port); /* Note that struct uart_port *port is no longer valid at this point */ serial_base_port_device_remove(port_dev); /* Drop the serial core controller device if no ports are using it */ if (!serial_core_ctrl_find(drv, phys_dev, ctrl_id)) serial_base_ctrl_device_remove(ctrl_dev); mutex_unlock(&port_mutex); } /** * uart_handle_dcd_change - handle a change of carrier detect state * @uport: uart_port structure for the open port * @active: new carrier detect status * * Caller must hold uport->lock. */ void uart_handle_dcd_change(struct uart_port *uport, bool active) { struct tty_port *port = &uport->state->port; struct tty_struct *tty = port->tty; struct tty_ldisc *ld; lockdep_assert_held_once(&uport->lock); if (tty) { ld = tty_ldisc_ref(tty); if (ld) { if (ld->ops->dcd_change) ld->ops->dcd_change(tty, active); tty_ldisc_deref(ld); } } uport->icount.dcd++; if (uart_dcd_enabled(uport)) { if (active) wake_up_interruptible(&port->open_wait); else if (tty) tty_hangup(tty); } } EXPORT_SYMBOL_GPL(uart_handle_dcd_change); /** * uart_handle_cts_change - handle a change of clear-to-send state * @uport: uart_port structure for the open port * @active: new clear-to-send status * * Caller must hold uport->lock. */ void uart_handle_cts_change(struct uart_port *uport, bool active) { lockdep_assert_held_once(&uport->lock); uport->icount.cts++; if (uart_softcts_mode(uport)) { if (uport->hw_stopped) { if (active) { uport->hw_stopped = false; uport->ops->start_tx(uport); uart_write_wakeup(uport); } } else { if (!active) { uport->hw_stopped = true; uport->ops->stop_tx(uport); } } } } EXPORT_SYMBOL_GPL(uart_handle_cts_change); /** * uart_insert_char - push a char to the uart layer * * User is responsible to call tty_flip_buffer_push when they are done with * insertion. * * @port: corresponding port * @status: state of the serial port RX buffer (LSR for 8250) * @overrun: mask of overrun bits in @status * @ch: character to push * @flag: flag for the character (see TTY_NORMAL and friends) */ void uart_insert_char(struct uart_port *port, unsigned int status, unsigned int overrun, u8 ch, u8 flag) { struct tty_port *tport = &port->state->port; if ((status & port->ignore_status_mask & ~overrun) == 0) if (tty_insert_flip_char(tport, ch, flag) == 0) ++port->icount.buf_overrun; /* * Overrun is special. Since it's reported immediately, * it doesn't affect the current character. */ if (status & ~port->ignore_status_mask & overrun) if (tty_insert_flip_char(tport, 0, TTY_OVERRUN) == 0) ++port->icount.buf_overrun; } EXPORT_SYMBOL_GPL(uart_insert_char); #ifdef CONFIG_MAGIC_SYSRQ_SERIAL static const u8 sysrq_toggle_seq[] = CONFIG_MAGIC_SYSRQ_SERIAL_SEQUENCE; static void uart_sysrq_on(struct work_struct *w) { int sysrq_toggle_seq_len = strlen(sysrq_toggle_seq); sysrq_toggle_support(1); pr_info("SysRq is enabled by magic sequence '%*pE' on serial\n", sysrq_toggle_seq_len, sysrq_toggle_seq); } static DECLARE_WORK(sysrq_enable_work, uart_sysrq_on); /** * uart_try_toggle_sysrq - Enables SysRq from serial line * @port: uart_port structure where char(s) after BREAK met * @ch: new character in the sequence after received BREAK * * Enables magic SysRq when the required sequence is met on port * (see CONFIG_MAGIC_SYSRQ_SERIAL_SEQUENCE). * * Returns: %false if @ch is out of enabling sequence and should be * handled some other way, %true if @ch was consumed. */ bool uart_try_toggle_sysrq(struct uart_port *port, u8 ch) { int sysrq_toggle_seq_len = strlen(sysrq_toggle_seq); if (!sysrq_toggle_seq_len) return false; BUILD_BUG_ON(ARRAY_SIZE(sysrq_toggle_seq) >= U8_MAX); if (sysrq_toggle_seq[port->sysrq_seq] != ch) { port->sysrq_seq = 0; return false; } if (++port->sysrq_seq < sysrq_toggle_seq_len) { port->sysrq = jiffies + SYSRQ_TIMEOUT; return true; } schedule_work(&sysrq_enable_work); port->sysrq = 0; return true; } EXPORT_SYMBOL_GPL(uart_try_toggle_sysrq); #endif /** * uart_get_rs485_mode() - retrieve rs485 properties for given uart * @port: uart device's target port * * This function implements the device tree binding described in * Documentation/devicetree/bindings/serial/rs485.txt. */ int uart_get_rs485_mode(struct uart_port *port) { struct serial_rs485 *rs485conf = &port->rs485; struct device *dev = port->dev; enum gpiod_flags dflags; struct gpio_desc *desc; u32 rs485_delay[2]; int ret; if (!(port->rs485_supported.flags & SER_RS485_ENABLED)) return 0; ret = device_property_read_u32_array(dev, "rs485-rts-delay", rs485_delay, 2); if (!ret) { rs485conf->delay_rts_before_send = rs485_delay[0]; rs485conf->delay_rts_after_send = rs485_delay[1]; } else { rs485conf->delay_rts_before_send = 0; rs485conf->delay_rts_after_send = 0; } uart_sanitize_serial_rs485_delays(port, rs485conf); /* * Clear full-duplex and enabled flags, set RTS polarity to active high * to get to a defined state with the following properties: */ rs485conf->flags &= ~(SER_RS485_RX_DURING_TX | SER_RS485_ENABLED | SER_RS485_TERMINATE_BUS | SER_RS485_RTS_AFTER_SEND); rs485conf->flags |= SER_RS485_RTS_ON_SEND; if (device_property_read_bool(dev, "rs485-rx-during-tx")) rs485conf->flags |= SER_RS485_RX_DURING_TX; if (device_property_read_bool(dev, "linux,rs485-enabled-at-boot-time")) rs485conf->flags |= SER_RS485_ENABLED; if (device_property_read_bool(dev, "rs485-rts-active-low")) { rs485conf->flags &= ~SER_RS485_RTS_ON_SEND; rs485conf->flags |= SER_RS485_RTS_AFTER_SEND; } /* * Disabling termination by default is the safe choice: Else if many * bus participants enable it, no communication is possible at all. * Works fine for short cables and users may enable for longer cables. */ desc = devm_gpiod_get_optional(dev, "rs485-term", GPIOD_OUT_LOW); if (IS_ERR(desc)) return dev_err_probe(dev, PTR_ERR(desc), "Cannot get rs485-term-gpios\n"); port->rs485_term_gpio = desc; if (port->rs485_term_gpio) port->rs485_supported.flags |= SER_RS485_TERMINATE_BUS; dflags = (rs485conf->flags & SER_RS485_RX_DURING_TX) ? GPIOD_OUT_HIGH : GPIOD_OUT_LOW; desc = devm_gpiod_get_optional(dev, "rs485-rx-during-tx", dflags); if (IS_ERR(desc)) return dev_err_probe(dev, PTR_ERR(desc), "Cannot get rs485-rx-during-tx-gpios\n"); port->rs485_rx_during_tx_gpio = desc; if (port->rs485_rx_during_tx_gpio) port->rs485_supported.flags |= SER_RS485_RX_DURING_TX; return 0; } EXPORT_SYMBOL_GPL(uart_get_rs485_mode); /* Compile-time assertions for serial_rs485 layout */ static_assert(offsetof(struct serial_rs485, padding) == (offsetof(struct serial_rs485, delay_rts_after_send) + sizeof(__u32))); static_assert(offsetof(struct serial_rs485, padding1) == offsetof(struct serial_rs485, padding[1])); static_assert((offsetof(struct serial_rs485, padding[4]) + sizeof(__u32)) == sizeof(struct serial_rs485)); MODULE_DESCRIPTION("Serial driver core"); MODULE_LICENSE("GPL"); |
9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 | /* * * dvb_ringbuffer.c: ring buffer implementation for the dvb driver * * Copyright (C) 2003 Oliver Endriss * Copyright (C) 2004 Andrew de Quincey * * based on code originally found in av7110.c & dvb_ci.c: * Copyright (C) 1999-2003 Ralph Metzler * & Marcus Metzler for convergence integrated media GmbH * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License * as published by the Free Software Foundation; either version 2.1 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/string.h> #include <linux/uaccess.h> #include <media/dvb_ringbuffer.h> #define PKT_READY 0 #define PKT_DISPOSED 1 void dvb_ringbuffer_init(struct dvb_ringbuffer *rbuf, void *data, size_t len) { rbuf->pread=rbuf->pwrite=0; rbuf->data=data; rbuf->size=len; rbuf->error=0; init_waitqueue_head(&rbuf->queue); spin_lock_init(&(rbuf->lock)); } int dvb_ringbuffer_empty(struct dvb_ringbuffer *rbuf) { /* smp_load_acquire() to load write pointer on reader side * this pairs with smp_store_release() in dvb_ringbuffer_write(), * dvb_ringbuffer_write_user(), or dvb_ringbuffer_reset() * * for memory barriers also see Documentation/core-api/circular-buffers.rst */ return (rbuf->pread == smp_load_acquire(&rbuf->pwrite)); } ssize_t dvb_ringbuffer_free(struct dvb_ringbuffer *rbuf) { ssize_t free; /* READ_ONCE() to load read pointer on writer side * this pairs with smp_store_release() in dvb_ringbuffer_read(), * dvb_ringbuffer_read_user(), dvb_ringbuffer_flush(), * or dvb_ringbuffer_reset() */ free = READ_ONCE(rbuf->pread) - rbuf->pwrite; if (free <= 0) free += rbuf->size; return free-1; } ssize_t dvb_ringbuffer_avail(struct dvb_ringbuffer *rbuf) { ssize_t avail; /* smp_load_acquire() to load write pointer on reader side * this pairs with smp_store_release() in dvb_ringbuffer_write(), * dvb_ringbuffer_write_user(), or dvb_ringbuffer_reset() */ avail = smp_load_acquire(&rbuf->pwrite) - rbuf->pread; if (avail < 0) avail += rbuf->size; return avail; } void dvb_ringbuffer_flush(struct dvb_ringbuffer *rbuf) { /* dvb_ringbuffer_flush() counts as read operation * smp_load_acquire() to load write pointer * smp_store_release() to update read pointer, this ensures that the * correct pointer is visible for subsequent dvb_ringbuffer_free() * calls on other cpu cores */ smp_store_release(&rbuf->pread, smp_load_acquire(&rbuf->pwrite)); rbuf->error = 0; } EXPORT_SYMBOL(dvb_ringbuffer_flush); void dvb_ringbuffer_reset(struct dvb_ringbuffer *rbuf) { /* dvb_ringbuffer_reset() counts as read and write operation * smp_store_release() to update read pointer */ smp_store_release(&rbuf->pread, 0); /* smp_store_release() to update write pointer */ smp_store_release(&rbuf->pwrite, 0); rbuf->error = 0; } void dvb_ringbuffer_flush_spinlock_wakeup(struct dvb_ringbuffer *rbuf) { unsigned long flags; spin_lock_irqsave(&rbuf->lock, flags); dvb_ringbuffer_flush(rbuf); spin_unlock_irqrestore(&rbuf->lock, flags); wake_up(&rbuf->queue); } ssize_t dvb_ringbuffer_read_user(struct dvb_ringbuffer *rbuf, u8 __user *buf, size_t len) { size_t todo = len; size_t split; split = (rbuf->pread + len > rbuf->size) ? rbuf->size - rbuf->pread : 0; if (split > 0) { if (copy_to_user(buf, rbuf->data+rbuf->pread, split)) return -EFAULT; buf += split; todo -= split; /* smp_store_release() for read pointer update to ensure * that buf is not overwritten until read is complete, * this pairs with READ_ONCE() in dvb_ringbuffer_free() */ smp_store_release(&rbuf->pread, 0); } if (copy_to_user(buf, rbuf->data+rbuf->pread, todo)) return -EFAULT; /* smp_store_release() to update read pointer, see above */ smp_store_release(&rbuf->pread, (rbuf->pread + todo) % rbuf->size); return len; } void dvb_ringbuffer_read(struct dvb_ringbuffer *rbuf, u8 *buf, size_t len) { size_t todo = len; size_t split; split = (rbuf->pread + len > rbuf->size) ? rbuf->size - rbuf->pread : 0; if (split > 0) { memcpy(buf, rbuf->data+rbuf->pread, split); buf += split; todo -= split; /* smp_store_release() for read pointer update to ensure * that buf is not overwritten until read is complete, * this pairs with READ_ONCE() in dvb_ringbuffer_free() */ smp_store_release(&rbuf->pread, 0); } memcpy(buf, rbuf->data+rbuf->pread, todo); /* smp_store_release() to update read pointer, see above */ smp_store_release(&rbuf->pread, (rbuf->pread + todo) % rbuf->size); } ssize_t dvb_ringbuffer_write(struct dvb_ringbuffer *rbuf, const u8 *buf, size_t len) { size_t todo = len; size_t split; split = (rbuf->pwrite + len > rbuf->size) ? rbuf->size - rbuf->pwrite : 0; if (split > 0) { memcpy(rbuf->data+rbuf->pwrite, buf, split); buf += split; todo -= split; /* smp_store_release() for write pointer update to ensure that * written data is visible on other cpu cores before the pointer * update, this pairs with smp_load_acquire() in * dvb_ringbuffer_empty() or dvb_ringbuffer_avail() */ smp_store_release(&rbuf->pwrite, 0); } memcpy(rbuf->data+rbuf->pwrite, buf, todo); /* smp_store_release() for write pointer update, see above */ smp_store_release(&rbuf->pwrite, (rbuf->pwrite + todo) % rbuf->size); return len; } ssize_t dvb_ringbuffer_write_user(struct dvb_ringbuffer *rbuf, const u8 __user *buf, size_t len) { int status; size_t todo = len; size_t split; split = (rbuf->pwrite + len > rbuf->size) ? rbuf->size - rbuf->pwrite : 0; if (split > 0) { status = copy_from_user(rbuf->data+rbuf->pwrite, buf, split); if (status) return len - todo; buf += split; todo -= split; /* smp_store_release() for write pointer update to ensure that * written data is visible on other cpu cores before the pointer * update, this pairs with smp_load_acquire() in * dvb_ringbuffer_empty() or dvb_ringbuffer_avail() */ smp_store_release(&rbuf->pwrite, 0); } status = copy_from_user(rbuf->data+rbuf->pwrite, buf, todo); if (status) return len - todo; /* smp_store_release() for write pointer update, see above */ smp_store_release(&rbuf->pwrite, (rbuf->pwrite + todo) % rbuf->size); return len; } ssize_t dvb_ringbuffer_pkt_write(struct dvb_ringbuffer *rbuf, u8* buf, size_t len) { int status; ssize_t oldpwrite = rbuf->pwrite; DVB_RINGBUFFER_WRITE_BYTE(rbuf, len >> 8); DVB_RINGBUFFER_WRITE_BYTE(rbuf, len & 0xff); DVB_RINGBUFFER_WRITE_BYTE(rbuf, PKT_READY); status = dvb_ringbuffer_write(rbuf, buf, len); if (status < 0) rbuf->pwrite = oldpwrite; return status; } ssize_t dvb_ringbuffer_pkt_read_user(struct dvb_ringbuffer *rbuf, size_t idx, int offset, u8 __user *buf, size_t len) { size_t todo; size_t split; size_t pktlen; pktlen = rbuf->data[idx] << 8; pktlen |= rbuf->data[(idx + 1) % rbuf->size]; if (offset > pktlen) return -EINVAL; if ((offset + len) > pktlen) len = pktlen - offset; idx = (idx + DVB_RINGBUFFER_PKTHDRSIZE + offset) % rbuf->size; todo = len; split = ((idx + len) > rbuf->size) ? rbuf->size - idx : 0; if (split > 0) { if (copy_to_user(buf, rbuf->data+idx, split)) return -EFAULT; buf += split; todo -= split; idx = 0; } if (copy_to_user(buf, rbuf->data+idx, todo)) return -EFAULT; return len; } ssize_t dvb_ringbuffer_pkt_read(struct dvb_ringbuffer *rbuf, size_t idx, int offset, u8* buf, size_t len) { size_t todo; size_t split; size_t pktlen; pktlen = rbuf->data[idx] << 8; pktlen |= rbuf->data[(idx + 1) % rbuf->size]; if (offset > pktlen) return -EINVAL; if ((offset + len) > pktlen) len = pktlen - offset; idx = (idx + DVB_RINGBUFFER_PKTHDRSIZE + offset) % rbuf->size; todo = len; split = ((idx + len) > rbuf->size) ? rbuf->size - idx : 0; if (split > 0) { memcpy(buf, rbuf->data+idx, split); buf += split; todo -= split; idx = 0; } memcpy(buf, rbuf->data+idx, todo); return len; } void dvb_ringbuffer_pkt_dispose(struct dvb_ringbuffer *rbuf, size_t idx) { size_t pktlen; rbuf->data[(idx + 2) % rbuf->size] = PKT_DISPOSED; // clean up disposed packets while(dvb_ringbuffer_avail(rbuf) > DVB_RINGBUFFER_PKTHDRSIZE) { if (DVB_RINGBUFFER_PEEK(rbuf, 2) == PKT_DISPOSED) { pktlen = DVB_RINGBUFFER_PEEK(rbuf, 0) << 8; pktlen |= DVB_RINGBUFFER_PEEK(rbuf, 1); DVB_RINGBUFFER_SKIP(rbuf, pktlen + DVB_RINGBUFFER_PKTHDRSIZE); } else { // first packet is not disposed, so we stop cleaning now break; } } } ssize_t dvb_ringbuffer_pkt_next(struct dvb_ringbuffer *rbuf, size_t idx, size_t* pktlen) { int consumed; int curpktlen; int curpktstatus; if (idx == -1) { idx = rbuf->pread; } else { curpktlen = rbuf->data[idx] << 8; curpktlen |= rbuf->data[(idx + 1) % rbuf->size]; idx = (idx + curpktlen + DVB_RINGBUFFER_PKTHDRSIZE) % rbuf->size; } consumed = (idx - rbuf->pread); if (consumed < 0) consumed += rbuf->size; while((dvb_ringbuffer_avail(rbuf) - consumed) > DVB_RINGBUFFER_PKTHDRSIZE) { curpktlen = rbuf->data[idx] << 8; curpktlen |= rbuf->data[(idx + 1) % rbuf->size]; curpktstatus = rbuf->data[(idx + 2) % rbuf->size]; if (curpktstatus == PKT_READY) { *pktlen = curpktlen; return idx; } consumed += curpktlen + DVB_RINGBUFFER_PKTHDRSIZE; idx = (idx + curpktlen + DVB_RINGBUFFER_PKTHDRSIZE) % rbuf->size; } // no packets available return -1; } EXPORT_SYMBOL(dvb_ringbuffer_init); EXPORT_SYMBOL(dvb_ringbuffer_empty); EXPORT_SYMBOL(dvb_ringbuffer_free); EXPORT_SYMBOL(dvb_ringbuffer_avail); EXPORT_SYMBOL(dvb_ringbuffer_flush_spinlock_wakeup); EXPORT_SYMBOL(dvb_ringbuffer_read_user); EXPORT_SYMBOL(dvb_ringbuffer_read); EXPORT_SYMBOL(dvb_ringbuffer_write); EXPORT_SYMBOL(dvb_ringbuffer_write_user); |
3128 3130 2 2810 174 167 2 1 2 2 14487 14455 14481 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 | // SPDX-License-Identifier: GPL-2.0 #include <linux/irq_work.h> #include <linux/spinlock.h> #include <linux/task_work.h> #include <linux/resume_user_mode.h> static struct callback_head work_exited; /* all we need is ->next == NULL */ #ifdef CONFIG_IRQ_WORK static void task_work_set_notify_irq(struct irq_work *entry) { test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); } static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = IRQ_WORK_INIT_HARD(task_work_set_notify_irq); #endif /** * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback * @work: the callback to run * @notify: how to notify the targeted task * * Queue @work for task_work_run() below and notify the @task if @notify * is @TWA_RESUME, @TWA_SIGNAL, @TWA_SIGNAL_NO_IPI or @TWA_NMI_CURRENT. * * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted * task and run the task_work, regardless of whether the task is currently * running in the kernel or userspace. * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a * reschedule IPI to force the targeted task to reschedule and run task_work. * This can be advantageous if there's no strict requirement that the * task_work be run as soon as possible, just whenever the task enters the * kernel anyway. * @TWA_RESUME work is run only when the task exits the kernel and returns to * user mode, or before entering guest mode. * @TWA_NMI_CURRENT works like @TWA_RESUME, except it can only be used for the * current @task and if the current context is NMI. * * Fails if the @task is exiting/exited and thus it can't process this @work. * Otherwise @work->func() will be called when the @task goes through one of * the aforementioned transitions, or exits. * * If the targeted task is exiting, then an error is returned and the work item * is not queued. It's up to the caller to arrange for an alternative mechanism * in that case. * * Note: there is no ordering guarantee on works queued here. The task_work * list is LIFO. * * RETURNS: * 0 if succeeds or -ESRCH. */ int task_work_add(struct task_struct *task, struct callback_head *work, enum task_work_notify_mode notify) { struct callback_head *head; int flags = notify & TWA_FLAGS; notify &= ~TWA_FLAGS; if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return -EINVAL; if (!IS_ENABLED(CONFIG_IRQ_WORK)) return -EINVAL; } else { /* * Record the work call stack in order to print it in KASAN * reports. * * Note that stack allocation can fail if TWAF_NO_ALLOC flag * is set and new page is needed to expand the stack buffer. */ if (flags & TWAF_NO_ALLOC) kasan_record_aux_stack_noalloc(work); else kasan_record_aux_stack(work); } head = READ_ONCE(task->task_works); do { if (unlikely(head == &work_exited)) return -ESRCH; work->next = head; } while (!try_cmpxchg(&task->task_works, &head, work)); switch (notify) { case TWA_NONE: break; case TWA_RESUME: set_notify_resume(task); break; case TWA_SIGNAL: set_notify_signal(task); break; case TWA_SIGNAL_NO_IPI: __set_notify_signal(task); break; #ifdef CONFIG_IRQ_WORK case TWA_NMI_CURRENT: irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); break; #endif default: WARN_ON_ONCE(1); break; } return 0; } /** * task_work_cancel_match - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @match: match function to call * @data: data to be passed in to match function * * RETURNS: * The found work or NULL if not found. */ struct callback_head * task_work_cancel_match(struct task_struct *task, bool (*match)(struct callback_head *, void *data), void *data) { struct callback_head **pprev = &task->task_works; struct callback_head *work; unsigned long flags; if (likely(!task_work_pending(task))) return NULL; /* * If cmpxchg() fails we continue without updating pprev. * Either we raced with task_work_add() which added the * new entry before this work, we will find it again. Or * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); work = READ_ONCE(*pprev); while (work) { if (!match(work, data)) { pprev = &work->next; work = READ_ONCE(*pprev); } else if (try_cmpxchg(pprev, &work, work->next)) break; } raw_spin_unlock_irqrestore(&task->pi_lock, flags); return work; } static bool task_work_func_match(struct callback_head *cb, void *data) { return cb->func == data; } /** * task_work_cancel_func - cancel a pending work matching a function added by task_work_add() * @task: the task which should execute the func's work * @func: identifies the func to match with a work to remove * * Find the last queued pending work with ->func == @func and remove * it from queue. * * RETURNS: * The found work or NULL if not found. */ struct callback_head * task_work_cancel_func(struct task_struct *task, task_work_func_t func) { return task_work_cancel_match(task, task_work_func_match, func); } static bool task_work_match(struct callback_head *cb, void *data) { return cb == data; } /** * task_work_cancel - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @cb: the callback to remove if queued * * Remove a callback from a task's queue if queued. * * RETURNS: * True if the callback was queued and got cancelled, false otherwise. */ bool task_work_cancel(struct task_struct *task, struct callback_head *cb) { struct callback_head *ret; ret = task_work_cancel_match(task, task_work_match, cb); return ret == cb; } /** * task_work_run - execute the works added by task_work_add() * * Flush the pending works. Should be used by the core kernel code. * Called before the task returns to the user-mode or stops, or when * it exits. In the latter case task_work_add() can no longer add the * new work after task_work_run() returns. */ void task_work_run(void) { struct task_struct *task = current; struct callback_head *work, *head, *next; for (;;) { /* * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ work = READ_ONCE(task->task_works); do { head = NULL; if (!work) { if (task->flags & PF_EXITING) head = &work_exited; else break; } } while (!try_cmpxchg(&task->task_works, &work, head)); if (!work) break; /* * Synchronize with task_work_cancel_match(). It can not remove * the first entry == work, cmpxchg(task_works) must fail. * But it can remove another entry from the ->next list. */ raw_spin_lock_irq(&task->pi_lock); raw_spin_unlock_irq(&task->pi_lock); do { next = work->next; work->func(work); work = next; cond_resched(); } while (work); } } |
12 8 4 3 12 12 12 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 | // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi * Copyright (C) 2015-2016 Nobuo Iwata */ #include <linux/init.h> #include <linux/file.h> #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/platform_device.h> #include <linux/slab.h> #include "usbip_common.h" #include "vhci.h" #define DRIVER_AUTHOR "Takahiro Hirofuchi" #define DRIVER_DESC "USB/IP 'Virtual' Host Controller (VHCI) Driver" /* * TODO * - update root hub emulation * - move the emulation code to userland ? * porting to other operating systems * minimize kernel code * - add suspend/resume code * - clean up everything */ /* See usb gadget dummy hcd */ static int vhci_hub_status(struct usb_hcd *hcd, char *buff); static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex, char *buff, u16 wLength); static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags); static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status); static int vhci_start(struct usb_hcd *vhci_hcd); static void vhci_stop(struct usb_hcd *hcd); static int vhci_get_frame_number(struct usb_hcd *hcd); static const char driver_name[] = "vhci_hcd"; static const char driver_desc[] = "USB/IP Virtual Host Controller"; int vhci_num_controllers = VHCI_NR_HCS; struct vhci *vhcis; static const char * const bit_desc[] = { "CONNECTION", /*0*/ "ENABLE", /*1*/ "SUSPEND", /*2*/ "OVER_CURRENT", /*3*/ "RESET", /*4*/ "L1", /*5*/ "R6", /*6*/ "R7", /*7*/ "POWER", /*8*/ "LOWSPEED", /*9*/ "HIGHSPEED", /*10*/ "PORT_TEST", /*11*/ "INDICATOR", /*12*/ "R13", /*13*/ "R14", /*14*/ "R15", /*15*/ "C_CONNECTION", /*16*/ "C_ENABLE", /*17*/ "C_SUSPEND", /*18*/ "C_OVER_CURRENT", /*19*/ "C_RESET", /*20*/ "C_L1", /*21*/ "R22", /*22*/ "R23", /*23*/ "R24", /*24*/ "R25", /*25*/ "R26", /*26*/ "R27", /*27*/ "R28", /*28*/ "R29", /*29*/ "R30", /*30*/ "R31", /*31*/ }; static const char * const bit_desc_ss[] = { "CONNECTION", /*0*/ "ENABLE", /*1*/ "SUSPEND", /*2*/ "OVER_CURRENT", /*3*/ "RESET", /*4*/ "L1", /*5*/ "R6", /*6*/ "R7", /*7*/ "R8", /*8*/ "POWER", /*9*/ "HIGHSPEED", /*10*/ "PORT_TEST", /*11*/ "INDICATOR", /*12*/ "R13", /*13*/ "R14", /*14*/ "R15", /*15*/ "C_CONNECTION", /*16*/ "C_ENABLE", /*17*/ "C_SUSPEND", /*18*/ "C_OVER_CURRENT", /*19*/ "C_RESET", /*20*/ "C_BH_RESET", /*21*/ "C_LINK_STATE", /*22*/ "C_CONFIG_ERROR", /*23*/ "R24", /*24*/ "R25", /*25*/ "R26", /*26*/ "R27", /*27*/ "R28", /*28*/ "R29", /*29*/ "R30", /*30*/ "R31", /*31*/ }; static void dump_port_status_diff(u32 prev_status, u32 new_status, bool usb3) { int i = 0; u32 bit = 1; const char * const *desc = bit_desc; if (usb3) desc = bit_desc_ss; pr_debug("status prev -> new: %08x -> %08x\n", prev_status, new_status); while (bit) { u32 prev = prev_status & bit; u32 new = new_status & bit; char change; if (!prev && new) change = '+'; else if (prev && !new) change = '-'; else change = ' '; if (prev || new) { pr_debug(" %c%s\n", change, desc[i]); if (bit == 1) /* USB_PORT_STAT_CONNECTION */ pr_debug(" %c%s\n", change, "USB_PORT_STAT_SPEED_5GBPS"); } bit <<= 1; i++; } pr_debug("\n"); } void rh_port_connect(struct vhci_device *vdev, enum usb_device_speed speed) { struct vhci_hcd *vhci_hcd = vdev_to_vhci_hcd(vdev); struct vhci *vhci = vhci_hcd->vhci; int rhport = vdev->rhport; u32 status; unsigned long flags; usbip_dbg_vhci_rh("rh_port_connect %d\n", rhport); spin_lock_irqsave(&vhci->lock, flags); status = vhci_hcd->port_status[rhport]; status |= USB_PORT_STAT_CONNECTION | (1 << USB_PORT_FEAT_C_CONNECTION); switch (speed) { case USB_SPEED_HIGH: status |= USB_PORT_STAT_HIGH_SPEED; break; case USB_SPEED_LOW: status |= USB_PORT_STAT_LOW_SPEED; break; default: break; } vhci_hcd->port_status[rhport] = status; spin_unlock_irqrestore(&vhci->lock, flags); usb_hcd_poll_rh_status(vhci_hcd_to_hcd(vhci_hcd)); } static void rh_port_disconnect(struct vhci_device *vdev) { struct vhci_hcd *vhci_hcd = vdev_to_vhci_hcd(vdev); struct vhci *vhci = vhci_hcd->vhci; int rhport = vdev->rhport; u32 status; unsigned long flags; usbip_dbg_vhci_rh("rh_port_disconnect %d\n", rhport); spin_lock_irqsave(&vhci->lock, flags); status = vhci_hcd->port_status[rhport]; status &= ~USB_PORT_STAT_CONNECTION; status |= (1 << USB_PORT_FEAT_C_CONNECTION); vhci_hcd->port_status[rhport] = status; spin_unlock_irqrestore(&vhci->lock, flags); usb_hcd_poll_rh_status(vhci_hcd_to_hcd(vhci_hcd)); } #define PORT_C_MASK \ ((USB_PORT_STAT_C_CONNECTION \ | USB_PORT_STAT_C_ENABLE \ | USB_PORT_STAT_C_SUSPEND \ | USB_PORT_STAT_C_OVERCURRENT \ | USB_PORT_STAT_C_RESET) << 16) /* * Returns 0 if the status hasn't changed, or the number of bytes in buf. * Ports are 0-indexed from the HCD point of view, * and 1-indexed from the USB core pointer of view. * * @buf: a bitmap to show which port status has been changed. * bit 0: reserved * bit 1: the status of port 0 has been changed. * bit 2: the status of port 1 has been changed. * ... */ static int vhci_hub_status(struct usb_hcd *hcd, char *buf) { struct vhci_hcd *vhci_hcd = hcd_to_vhci_hcd(hcd); struct vhci *vhci = vhci_hcd->vhci; int retval = DIV_ROUND_UP(VHCI_HC_PORTS + 1, 8); int rhport; int changed = 0; unsigned long flags; memset(buf, 0, retval); spin_lock_irqsave(&vhci->lock, flags); if (!HCD_HW_ACCESSIBLE(hcd)) { usbip_dbg_vhci_rh("hw accessible flag not on?\n"); goto done; } /* check pseudo status register for each port */ for (rhport = 0; rhport < VHCI_HC_PORTS; rhport++) { if ((vhci_hcd->port_status[rhport] & PORT_C_MASK)) { /* The status of a port has been changed, */ usbip_dbg_vhci_rh("port %d status changed\n", rhport); buf[(rhport + 1) / 8] |= 1 << (rhport + 1) % 8; changed = 1; } } if ((hcd->state == HC_STATE_SUSPENDED) && (changed == 1)) usb_hcd_resume_root_hub(hcd); done: spin_unlock_irqrestore(&vhci->lock, flags); return changed ? retval : 0; } /* usb 3.0 root hub device descriptor */ static struct { struct usb_bos_descriptor bos; struct usb_ss_cap_descriptor ss_cap; } __packed usb3_bos_desc = { .bos = { .bLength = USB_DT_BOS_SIZE, .bDescriptorType = USB_DT_BOS, .wTotalLength = cpu_to_le16(sizeof(usb3_bos_desc)), .bNumDeviceCaps = 1, }, .ss_cap = { .bLength = USB_DT_USB_SS_CAP_SIZE, .bDescriptorType = USB_DT_DEVICE_CAPABILITY, .bDevCapabilityType = USB_SS_CAP_TYPE, .wSpeedSupported = cpu_to_le16(USB_5GBPS_OPERATION), .bFunctionalitySupport = ilog2(USB_5GBPS_OPERATION), }, }; static inline void ss_hub_descriptor(struct usb_hub_descriptor *desc) { memset(desc, 0, sizeof *desc); desc->bDescriptorType = USB_DT_SS_HUB; desc->bDescLength = 12; desc->wHubCharacteristics = cpu_to_le16( HUB_CHAR_INDV_PORT_LPSM | HUB_CHAR_COMMON_OCPM); desc->bNbrPorts = VHCI_HC_PORTS; desc->u.ss.bHubHdrDecLat = 0x04; /* Worst case: 0.4 micro sec*/ desc->u.ss.DeviceRemovable = 0xffff; } static inline void hub_descriptor(struct usb_hub_descriptor *desc) { int width; memset(desc, 0, sizeof(*desc)); desc->bDescriptorType = USB_DT_HUB; desc->wHubCharacteristics = cpu_to_le16( HUB_CHAR_INDV_PORT_LPSM | HUB_CHAR_COMMON_OCPM); desc->bNbrPorts = VHCI_HC_PORTS; BUILD_BUG_ON(VHCI_HC_PORTS > USB_MAXCHILDREN); width = desc->bNbrPorts / 8 + 1; desc->bDescLength = USB_DT_HUB_NONVAR_SIZE + 2 * width; memset(&desc->u.hs.DeviceRemovable[0], 0, width); memset(&desc->u.hs.DeviceRemovable[width], 0xff, width); } static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex, char *buf, u16 wLength) { struct vhci_hcd *vhci_hcd; struct vhci *vhci; int retval = 0; int rhport = -1; unsigned long flags; bool invalid_rhport = false; u32 prev_port_status[VHCI_HC_PORTS]; if (!HCD_HW_ACCESSIBLE(hcd)) return -ETIMEDOUT; /* * NOTE: * wIndex (bits 0-7) shows the port number and begins from 1? */ wIndex = ((__u8)(wIndex & 0x00ff)); usbip_dbg_vhci_rh("typeReq %x wValue %x wIndex %x\n", typeReq, wValue, wIndex); /* * wIndex can be 0 for some request types (typeReq). rhport is * in valid range when wIndex >= 1 and < VHCI_HC_PORTS. * * Reference port_status[] only with valid rhport when * invalid_rhport is false. */ if (wIndex < 1 || wIndex > VHCI_HC_PORTS) { invalid_rhport = true; if (wIndex > VHCI_HC_PORTS) pr_err("invalid port number %d\n", wIndex); } else rhport = wIndex - 1; vhci_hcd = hcd_to_vhci_hcd(hcd); vhci = vhci_hcd->vhci; spin_lock_irqsave(&vhci->lock, flags); /* store old status and compare now and old later */ if (usbip_dbg_flag_vhci_rh) { if (!invalid_rhport) memcpy(prev_port_status, vhci_hcd->port_status, sizeof(prev_port_status)); } switch (typeReq) { case ClearHubFeature: usbip_dbg_vhci_rh(" ClearHubFeature\n"); break; case ClearPortFeature: if (invalid_rhport) { pr_err("invalid port number %d\n", wIndex); goto error; } switch (wValue) { case USB_PORT_FEAT_SUSPEND: if (hcd->speed >= HCD_USB3) { pr_err(" ClearPortFeature: USB_PORT_FEAT_SUSPEND req not " "supported for USB 3.0 roothub\n"); goto error; } usbip_dbg_vhci_rh( " ClearPortFeature: USB_PORT_FEAT_SUSPEND\n"); if (vhci_hcd->port_status[rhport] & USB_PORT_STAT_SUSPEND) { /* 20msec signaling */ vhci_hcd->resuming = 1; vhci_hcd->re_timeout = jiffies + msecs_to_jiffies(20); } break; case USB_PORT_FEAT_POWER: usbip_dbg_vhci_rh( " ClearPortFeature: USB_PORT_FEAT_POWER\n"); if (hcd->speed >= HCD_USB3) vhci_hcd->port_status[rhport] &= ~USB_SS_PORT_STAT_POWER; else vhci_hcd->port_status[rhport] &= ~USB_PORT_STAT_POWER; break; default: usbip_dbg_vhci_rh(" ClearPortFeature: default %x\n", wValue); if (wValue >= 32) goto error; vhci_hcd->port_status[rhport] &= ~(1 << wValue); break; } break; case GetHubDescriptor: usbip_dbg_vhci_rh(" GetHubDescriptor\n"); if (hcd->speed >= HCD_USB3 && (wLength < USB_DT_SS_HUB_SIZE || wValue != (USB_DT_SS_HUB << 8))) { pr_err("Wrong hub descriptor type for USB 3.0 roothub.\n"); goto error; } if (hcd->speed >= HCD_USB3) ss_hub_descriptor((struct usb_hub_descriptor *) buf); else hub_descriptor((struct usb_hub_descriptor *) buf); break; case DeviceRequest | USB_REQ_GET_DESCRIPTOR: if (hcd->speed < HCD_USB3) goto error; if ((wValue >> 8) != USB_DT_BOS) goto error; memcpy(buf, &usb3_bos_desc, sizeof(usb3_bos_desc)); retval = sizeof(usb3_bos_desc); break; case GetHubStatus: usbip_dbg_vhci_rh(" GetHubStatus\n"); *(__le32 *) buf = cpu_to_le32(0); break; case GetPortStatus: usbip_dbg_vhci_rh(" GetPortStatus port %x\n", wIndex); if (invalid_rhport) { pr_err("invalid port number %d\n", wIndex); retval = -EPIPE; goto error; } /* we do not care about resume. */ /* whoever resets or resumes must GetPortStatus to * complete it!! */ if (vhci_hcd->resuming && time_after(jiffies, vhci_hcd->re_timeout)) { vhci_hcd->port_status[rhport] |= (1 << USB_PORT_FEAT_C_SUSPEND); vhci_hcd->port_status[rhport] &= ~(1 << USB_PORT_FEAT_SUSPEND); vhci_hcd->resuming = 0; vhci_hcd->re_timeout = 0; } if ((vhci_hcd->port_status[rhport] & (1 << USB_PORT_FEAT_RESET)) != 0 && time_after(jiffies, vhci_hcd->re_timeout)) { vhci_hcd->port_status[rhport] |= (1 << USB_PORT_FEAT_C_RESET); vhci_hcd->port_status[rhport] &= ~(1 << USB_PORT_FEAT_RESET); vhci_hcd->re_timeout = 0; /* * A few drivers do usb reset during probe when * the device could be in VDEV_ST_USED state */ if (vhci_hcd->vdev[rhport].ud.status == VDEV_ST_NOTASSIGNED || vhci_hcd->vdev[rhport].ud.status == VDEV_ST_USED) { usbip_dbg_vhci_rh( " enable rhport %d (status %u)\n", rhport, vhci_hcd->vdev[rhport].ud.status); vhci_hcd->port_status[rhport] |= USB_PORT_STAT_ENABLE; } if (hcd->speed < HCD_USB3) { switch (vhci_hcd->vdev[rhport].speed) { case USB_SPEED_HIGH: vhci_hcd->port_status[rhport] |= USB_PORT_STAT_HIGH_SPEED; break; case USB_SPEED_LOW: vhci_hcd->port_status[rhport] |= USB_PORT_STAT_LOW_SPEED; break; default: pr_err("vhci_device speed not set\n"); break; } } } ((__le16 *) buf)[0] = cpu_to_le16(vhci_hcd->port_status[rhport]); ((__le16 *) buf)[1] = cpu_to_le16(vhci_hcd->port_status[rhport] >> 16); usbip_dbg_vhci_rh(" GetPortStatus bye %x %x\n", ((u16 *)buf)[0], ((u16 *)buf)[1]); break; case SetHubFeature: usbip_dbg_vhci_rh(" SetHubFeature\n"); retval = -EPIPE; break; case SetPortFeature: switch (wValue) { case USB_PORT_FEAT_LINK_STATE: usbip_dbg_vhci_rh( " SetPortFeature: USB_PORT_FEAT_LINK_STATE\n"); if (hcd->speed < HCD_USB3) { pr_err("USB_PORT_FEAT_LINK_STATE req not " "supported for USB 2.0 roothub\n"); goto error; } /* * Since this is dummy we don't have an actual link so * there is nothing to do for the SET_LINK_STATE cmd */ break; case USB_PORT_FEAT_U1_TIMEOUT: usbip_dbg_vhci_rh( " SetPortFeature: USB_PORT_FEAT_U1_TIMEOUT\n"); fallthrough; case USB_PORT_FEAT_U2_TIMEOUT: usbip_dbg_vhci_rh( " SetPortFeature: USB_PORT_FEAT_U2_TIMEOUT\n"); /* TODO: add suspend/resume support! */ if (hcd->speed < HCD_USB3) { pr_err("USB_PORT_FEAT_U1/2_TIMEOUT req not " "supported for USB 2.0 roothub\n"); goto error; } break; case USB_PORT_FEAT_SUSPEND: usbip_dbg_vhci_rh( " SetPortFeature: USB_PORT_FEAT_SUSPEND\n"); /* Applicable only for USB2.0 hub */ if (hcd->speed >= HCD_USB3) { pr_err("USB_PORT_FEAT_SUSPEND req not " "supported for USB 3.0 roothub\n"); goto error; } if (invalid_rhport) { pr_err("invalid port number %d\n", wIndex); goto error; } vhci_hcd->port_status[rhport] |= USB_PORT_STAT_SUSPEND; break; case USB_PORT_FEAT_POWER: usbip_dbg_vhci_rh( " SetPortFeature: USB_PORT_FEAT_POWER\n"); if (invalid_rhport) { pr_err("invalid port number %d\n", wIndex); goto error; } if (hcd->speed >= HCD_USB3) vhci_hcd->port_status[rhport] |= USB_SS_PORT_STAT_POWER; else vhci_hcd->port_status[rhport] |= USB_PORT_STAT_POWER; break; case USB_PORT_FEAT_BH_PORT_RESET: usbip_dbg_vhci_rh( " SetPortFeature: USB_PORT_FEAT_BH_PORT_RESET\n"); if (invalid_rhport) { pr_err("invalid port number %d\n", wIndex); goto error; } /* Applicable only for USB3.0 hub */ if (hcd->speed < HCD_USB3) { pr_err("USB_PORT_FEAT_BH_PORT_RESET req not " "supported for USB 2.0 roothub\n"); goto error; } fallthrough; case USB_PORT_FEAT_RESET: usbip_dbg_vhci_rh( " SetPortFeature: USB_PORT_FEAT_RESET\n"); if (invalid_rhport) { pr_err("invalid port number %d\n", wIndex); goto error; } /* if it's already enabled, disable */ if (hcd->speed >= HCD_USB3) { vhci_hcd->port_status[rhport] = 0; vhci_hcd->port_status[rhport] = (USB_SS_PORT_STAT_POWER | USB_PORT_STAT_CONNECTION | USB_PORT_STAT_RESET); } else if (vhci_hcd->port_status[rhport] & USB_PORT_STAT_ENABLE) { vhci_hcd->port_status[rhport] &= ~(USB_PORT_STAT_ENABLE | USB_PORT_STAT_LOW_SPEED | USB_PORT_STAT_HIGH_SPEED); } /* 50msec reset signaling */ vhci_hcd->re_timeout = jiffies + msecs_to_jiffies(50); fallthrough; default: usbip_dbg_vhci_rh(" SetPortFeature: default %d\n", wValue); if (invalid_rhport) { pr_err("invalid port number %d\n", wIndex); goto error; } if (wValue >= 32) goto error; if (hcd->speed >= HCD_USB3) { if ((vhci_hcd->port_status[rhport] & USB_SS_PORT_STAT_POWER) != 0) { vhci_hcd->port_status[rhport] |= (1 << wValue); } } else if ((vhci_hcd->port_status[rhport] & USB_PORT_STAT_POWER) != 0) { vhci_hcd->port_status[rhport] |= (1 << wValue); } } break; case GetPortErrorCount: usbip_dbg_vhci_rh(" GetPortErrorCount\n"); if (hcd->speed < HCD_USB3) { pr_err("GetPortErrorCount req not " "supported for USB 2.0 roothub\n"); goto error; } /* We'll always return 0 since this is a dummy hub */ *(__le32 *) buf = cpu_to_le32(0); break; case SetHubDepth: usbip_dbg_vhci_rh(" SetHubDepth\n"); if (hcd->speed < HCD_USB3) { pr_err("SetHubDepth req not supported for " "USB 2.0 roothub\n"); goto error; } break; default: pr_err("default hub control req: %04x v%04x i%04x l%d\n", typeReq, wValue, wIndex, wLength); error: /* "protocol stall" on error */ retval = -EPIPE; } if (usbip_dbg_flag_vhci_rh) { pr_debug("port %d\n", rhport); /* Only dump valid port status */ if (!invalid_rhport) { dump_port_status_diff(prev_port_status[rhport], vhci_hcd->port_status[rhport], hcd->speed >= HCD_USB3); } } usbip_dbg_vhci_rh(" bye\n"); spin_unlock_irqrestore(&vhci->lock, flags); if (!invalid_rhport && (vhci_hcd->port_status[rhport] & PORT_C_MASK) != 0) { usb_hcd_poll_rh_status(hcd); } return retval; } static void vhci_tx_urb(struct urb *urb, struct vhci_device *vdev) { struct vhci_priv *priv; struct vhci_hcd *vhci_hcd = vdev_to_vhci_hcd(vdev); unsigned long flags; priv = kzalloc(sizeof(struct vhci_priv), GFP_ATOMIC); if (!priv) { usbip_event_add(&vdev->ud, VDEV_EVENT_ERROR_MALLOC); return; } spin_lock_irqsave(&vdev->priv_lock, flags); priv->seqnum = atomic_inc_return(&vhci_hcd->seqnum); if (priv->seqnum == 0xffff) dev_info(&urb->dev->dev, "seqnum max\n"); priv->vdev = vdev; priv->urb = urb; urb->hcpriv = (void *) priv; list_add_tail(&priv->list, &vdev->priv_tx); wake_up(&vdev->waitq_tx); spin_unlock_irqrestore(&vdev->priv_lock, flags); } static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags) { struct vhci_hcd *vhci_hcd = hcd_to_vhci_hcd(hcd); struct vhci *vhci = vhci_hcd->vhci; struct device *dev = &urb->dev->dev; u8 portnum = urb->dev->portnum; int ret = 0; struct vhci_device *vdev; unsigned long flags; if (portnum > VHCI_HC_PORTS) { pr_err("invalid port number %d\n", portnum); return -ENODEV; } vdev = &vhci_hcd->vdev[portnum-1]; if (!urb->transfer_buffer && !urb->num_sgs && urb->transfer_buffer_length) { dev_dbg(dev, "Null URB transfer buffer\n"); return -EINVAL; } spin_lock_irqsave(&vhci->lock, flags); if (urb->status != -EINPROGRESS) { dev_err(dev, "URB already unlinked!, status %d\n", urb->status); spin_unlock_irqrestore(&vhci->lock, flags); return urb->status; } /* refuse enqueue for dead connection */ spin_lock(&vdev->ud.lock); if (vdev->ud.status == VDEV_ST_NULL || vdev->ud.status == VDEV_ST_ERROR) { dev_err(dev, "enqueue for inactive port %d\n", vdev->rhport); spin_unlock(&vdev->ud.lock); spin_unlock_irqrestore(&vhci->lock, flags); return -ENODEV; } spin_unlock(&vdev->ud.lock); ret = usb_hcd_link_urb_to_ep(hcd, urb); if (ret) goto no_need_unlink; /* * The enumeration process is as follows; * * 1. Get_Descriptor request to DevAddrs(0) EndPoint(0) * to get max packet length of default pipe * * 2. Set_Address request to DevAddr(0) EndPoint(0) * */ if (usb_pipedevice(urb->pipe) == 0) { struct usb_device *old; __u8 type = usb_pipetype(urb->pipe); struct usb_ctrlrequest *ctrlreq = (struct usb_ctrlrequest *) urb->setup_packet; if (type != PIPE_CONTROL || !ctrlreq) { dev_err(dev, "invalid request to devnum 0\n"); ret = -EINVAL; goto no_need_xmit; } old = vdev->udev; switch (ctrlreq->bRequest) { case USB_REQ_SET_ADDRESS: /* set_address may come when a device is reset */ dev_info(dev, "SetAddress Request (%d) to port %d\n", ctrlreq->wValue, vdev->rhport); vdev->udev = usb_get_dev(urb->dev); usb_put_dev(old); spin_lock(&vdev->ud.lock); vdev->ud.status = VDEV_ST_USED; spin_unlock(&vdev->ud.lock); if (urb->status == -EINPROGRESS) { /* This request is successfully completed. */ /* If not -EINPROGRESS, possibly unlinked. */ urb->status = 0; } goto no_need_xmit; case USB_REQ_GET_DESCRIPTOR: if (ctrlreq->wValue == cpu_to_le16(USB_DT_DEVICE << 8)) usbip_dbg_vhci_hc( "Not yet?:Get_Descriptor to device 0 (get max pipe size)\n"); vdev->udev = usb_get_dev(urb->dev); usb_put_dev(old); goto out; default: /* NOT REACHED */ dev_err(dev, "invalid request to devnum 0 bRequest %u, wValue %u\n", ctrlreq->bRequest, ctrlreq->wValue); ret = -EINVAL; goto no_need_xmit; } } out: vhci_tx_urb(urb, vdev); spin_unlock_irqrestore(&vhci->lock, flags); return 0; no_need_xmit: usb_hcd_unlink_urb_from_ep(hcd, urb); no_need_unlink: spin_unlock_irqrestore(&vhci->lock, flags); if (!ret) { /* usb_hcd_giveback_urb() should be called with * irqs disabled */ local_irq_disable(); usb_hcd_giveback_urb(hcd, urb, urb->status); local_irq_enable(); } return ret; } /* * vhci_rx gives back the urb after receiving the reply of the urb. If an * unlink pdu is sent or not, vhci_rx receives a normal return pdu and gives * back its urb. For the driver unlinking the urb, the content of the urb is * not important, but the calling to its completion handler is important; the * completion of unlinking is notified by the completion handler. * * * CLIENT SIDE * * - When vhci_hcd receives RET_SUBMIT, * * - case 1a). the urb of the pdu is not unlinking. * - normal case * => just give back the urb * * - case 1b). the urb of the pdu is unlinking. * - usbip.ko will return a reply of the unlinking request. * => give back the urb now and go to case 2b). * * - When vhci_hcd receives RET_UNLINK, * * - case 2a). a submit request is still pending in vhci_hcd. * - urb was really pending in usbip.ko and urb_unlink_urb() was * completed there. * => free a pending submit request * => notify unlink completeness by giving back the urb * * - case 2b). a submit request is *not* pending in vhci_hcd. * - urb was already given back to the core driver. * => do not give back the urb * * * SERVER SIDE * * - When usbip receives CMD_UNLINK, * * - case 3a). the urb of the unlink request is now in submission. * => do usb_unlink_urb(). * => after the unlink is completed, send RET_UNLINK. * * - case 3b). the urb of the unlink request is not in submission. * - may be already completed or never be received * => send RET_UNLINK * */ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) { struct vhci_hcd *vhci_hcd = hcd_to_vhci_hcd(hcd); struct vhci *vhci = vhci_hcd->vhci; struct vhci_priv *priv; struct vhci_device *vdev; unsigned long flags; spin_lock_irqsave(&vhci->lock, flags); priv = urb->hcpriv; if (!priv) { /* URB was never linked! or will be soon given back by * vhci_rx. */ spin_unlock_irqrestore(&vhci->lock, flags); return -EIDRM; } { int ret = 0; ret = usb_hcd_check_unlink_urb(hcd, urb, status); if (ret) { spin_unlock_irqrestore(&vhci->lock, flags); return ret; } } /* send unlink request here? */ vdev = priv->vdev; if (!vdev->ud.tcp_socket) { /* tcp connection is closed */ spin_lock(&vdev->priv_lock); list_del(&priv->list); kfree(priv); urb->hcpriv = NULL; spin_unlock(&vdev->priv_lock); /* * If tcp connection is alive, we have sent CMD_UNLINK. * vhci_rx will receive RET_UNLINK and give back the URB. * Otherwise, we give back it here. */ usb_hcd_unlink_urb_from_ep(hcd, urb); spin_unlock_irqrestore(&vhci->lock, flags); usb_hcd_giveback_urb(hcd, urb, urb->status); spin_lock_irqsave(&vhci->lock, flags); } else { /* tcp connection is alive */ struct vhci_unlink *unlink; spin_lock(&vdev->priv_lock); /* setup CMD_UNLINK pdu */ unlink = kzalloc(sizeof(struct vhci_unlink), GFP_ATOMIC); if (!unlink) { spin_unlock(&vdev->priv_lock); spin_unlock_irqrestore(&vhci->lock, flags); usbip_event_add(&vdev->ud, VDEV_EVENT_ERROR_MALLOC); return -ENOMEM; } unlink->seqnum = atomic_inc_return(&vhci_hcd->seqnum); if (unlink->seqnum == 0xffff) pr_info("seqnum max\n"); unlink->unlink_seqnum = priv->seqnum; /* send cmd_unlink and try to cancel the pending URB in the * peer */ list_add_tail(&unlink->list, &vdev->unlink_tx); wake_up(&vdev->waitq_tx); spin_unlock(&vdev->priv_lock); } spin_unlock_irqrestore(&vhci->lock, flags); usbip_dbg_vhci_hc("leave\n"); return 0; } static void vhci_cleanup_unlink_list(struct vhci_device *vdev, struct list_head *unlink_list) { struct vhci_hcd *vhci_hcd = vdev_to_vhci_hcd(vdev); struct usb_hcd *hcd = vhci_hcd_to_hcd(vhci_hcd); struct vhci *vhci = vhci_hcd->vhci; struct vhci_unlink *unlink, *tmp; unsigned long flags; spin_lock_irqsave(&vhci->lock, flags); spin_lock(&vdev->priv_lock); list_for_each_entry_safe(unlink, tmp, unlink_list, list) { struct urb *urb; urb = pickup_urb_and_free_priv(vdev, unlink->unlink_seqnum); if (!urb) { list_del(&unlink->list); kfree(unlink); continue; } urb->status = -ENODEV; usb_hcd_unlink_urb_from_ep(hcd, urb); list_del(&unlink->list); spin_unlock(&vdev->priv_lock); spin_unlock_irqrestore(&vhci->lock, flags); usb_hcd_giveback_urb(hcd, urb, urb->status); spin_lock_irqsave(&vhci->lock, flags); spin_lock(&vdev->priv_lock); kfree(unlink); } spin_unlock(&vdev->priv_lock); spin_unlock_irqrestore(&vhci->lock, flags); } static void vhci_device_unlink_cleanup(struct vhci_device *vdev) { /* give back URB of unsent unlink request */ vhci_cleanup_unlink_list(vdev, &vdev->unlink_tx); /* give back URB of unanswered unlink request */ vhci_cleanup_unlink_list(vdev, &vdev->unlink_rx); } /* * The important thing is that only one context begins cleanup. * This is why error handling and cleanup become simple. * We do not want to consider race condition as possible. */ static void vhci_shutdown_connection(struct usbip_device *ud) { struct vhci_device *vdev = container_of(ud, struct vhci_device, ud); /* need this? see stub_dev.c */ if (ud->tcp_socket) { pr_debug("shutdown tcp_socket %d\n", ud->sockfd); kernel_sock_shutdown(ud->tcp_socket, SHUT_RDWR); } /* kill threads related to this sdev */ if (vdev->ud.tcp_rx) { kthread_stop_put(vdev->ud.tcp_rx); vdev->ud.tcp_rx = NULL; } if (vdev->ud.tcp_tx) { kthread_stop_put(vdev->ud.tcp_tx); vdev->ud.tcp_tx = NULL; } pr_info("stop threads\n"); /* active connection is closed */ if (vdev->ud.tcp_socket) { sockfd_put(vdev->ud.tcp_socket); vdev->ud.tcp_socket = NULL; vdev->ud.sockfd = -1; } pr_info("release socket\n"); vhci_device_unlink_cleanup(vdev); /* * rh_port_disconnect() is a trigger of ... * usb_disable_device(): * disable all the endpoints for a USB device. * usb_disable_endpoint(): * disable endpoints. pending urbs are unlinked(dequeued). * * NOTE: After calling rh_port_disconnect(), the USB device drivers of a * detached device should release used urbs in a cleanup function (i.e. * xxx_disconnect()). Therefore, vhci_hcd does not need to release * pushed urbs and their private data in this function. * * NOTE: vhci_dequeue() must be considered carefully. When shutting down * a connection, vhci_shutdown_connection() expects vhci_dequeue() * gives back pushed urbs and frees their private data by request of * the cleanup function of a USB driver. When unlinking a urb with an * active connection, vhci_dequeue() does not give back the urb which * is actually given back by vhci_rx after receiving its return pdu. * */ rh_port_disconnect(vdev); pr_info("disconnect device\n"); } static void vhci_device_reset(struct usbip_device *ud) { struct vhci_device *vdev = container_of(ud, struct vhci_device, ud); struct usb_device *old = vdev->udev; unsigned long flags; spin_lock_irqsave(&ud->lock, flags); vdev->speed = 0; vdev->devid = 0; vdev->udev = NULL; usb_put_dev(old); if (ud->tcp_socket) { sockfd_put(ud->tcp_socket); ud->tcp_socket = NULL; ud->sockfd = -1; } ud->status = VDEV_ST_NULL; spin_unlock_irqrestore(&ud->lock, flags); } static void vhci_device_unusable(struct usbip_device *ud) { unsigned long flags; spin_lock_irqsave(&ud->lock, flags); ud->status = VDEV_ST_ERROR; spin_unlock_irqrestore(&ud->lock, flags); } static void vhci_device_init(struct vhci_device *vdev) { memset(vdev, 0, sizeof(struct vhci_device)); vdev->ud.side = USBIP_VHCI; vdev->ud.status = VDEV_ST_NULL; spin_lock_init(&vdev->ud.lock); mutex_init(&vdev->ud.sysfs_lock); INIT_LIST_HEAD(&vdev->priv_rx); INIT_LIST_HEAD(&vdev->priv_tx); INIT_LIST_HEAD(&vdev->unlink_tx); INIT_LIST_HEAD(&vdev->unlink_rx); spin_lock_init(&vdev->priv_lock); init_waitqueue_head(&vdev->waitq_tx); vdev->ud.eh_ops.shutdown = vhci_shutdown_connection; vdev->ud.eh_ops.reset = vhci_device_reset; vdev->ud.eh_ops.unusable = vhci_device_unusable; usbip_start_eh(&vdev->ud); } static int hcd_name_to_id(const char *name) { char *c; long val; int ret; c = strchr(name, '.'); if (c == NULL) return 0; ret = kstrtol(c+1, 10, &val); if (ret < 0) return ret; return val; } static int vhci_setup(struct usb_hcd *hcd) { struct vhci *vhci = *((void **)dev_get_platdata(hcd->self.controller)); if (usb_hcd_is_primary_hcd(hcd)) { vhci->vhci_hcd_hs = hcd_to_vhci_hcd(hcd); vhci->vhci_hcd_hs->vhci = vhci; /* * Mark the first roothub as being USB 2.0. * The USB 3.0 roothub will be registered later by * vhci_hcd_probe() */ hcd->speed = HCD_USB2; hcd->self.root_hub->speed = USB_SPEED_HIGH; } else { vhci->vhci_hcd_ss = hcd_to_vhci_hcd(hcd); vhci->vhci_hcd_ss->vhci = vhci; hcd->speed = HCD_USB31; hcd->self.root_hub->speed = USB_SPEED_SUPER_PLUS; } /* * Support SG. * sg_tablesize is an arbitrary value to alleviate memory pressure * on the host. */ hcd->self.sg_tablesize = 32; hcd->self.no_sg_constraint = 1; return 0; } static int vhci_start(struct usb_hcd *hcd) { struct vhci_hcd *vhci_hcd = hcd_to_vhci_hcd(hcd); int id, rhport; int err; usbip_dbg_vhci_hc("enter vhci_start\n"); if (usb_hcd_is_primary_hcd(hcd)) spin_lock_init(&vhci_hcd->vhci->lock); /* initialize private data of usb_hcd */ for (rhport = 0; rhport < VHCI_HC_PORTS; rhport++) { struct vhci_device *vdev = &vhci_hcd->vdev[rhport]; vhci_device_init(vdev); vdev->rhport = rhport; } atomic_set(&vhci_hcd->seqnum, 0); hcd->power_budget = 0; /* no limit */ hcd->uses_new_polling = 1; #ifdef CONFIG_USB_OTG hcd->self.otg_port = 1; #endif id = hcd_name_to_id(hcd_name(hcd)); if (id < 0) { pr_err("invalid vhci name %s\n", hcd_name(hcd)); return -EINVAL; } /* vhci_hcd is now ready to be controlled through sysfs */ if (id == 0 && usb_hcd_is_primary_hcd(hcd)) { err = vhci_init_attr_group(); if (err) { dev_err(hcd_dev(hcd), "init attr group failed, err = %d\n", err); return err; } err = sysfs_create_group(&hcd_dev(hcd)->kobj, &vhci_attr_group); if (err) { dev_err(hcd_dev(hcd), "create sysfs files failed, err = %d\n", err); vhci_finish_attr_group(); return err; } pr_info("created sysfs %s\n", hcd_name(hcd)); } return 0; } static void vhci_stop(struct usb_hcd *hcd) { struct vhci_hcd *vhci_hcd = hcd_to_vhci_hcd(hcd); int id, rhport; usbip_dbg_vhci_hc("stop VHCI controller\n"); /* 1. remove the userland interface of vhci_hcd */ id = hcd_name_to_id(hcd_name(hcd)); if (id == 0 && usb_hcd_is_primary_hcd(hcd)) { sysfs_remove_group(&hcd_dev(hcd)->kobj, &vhci_attr_group); vhci_finish_attr_group(); } /* 2. shutdown all the ports of vhci_hcd */ for (rhport = 0; rhport < VHCI_HC_PORTS; rhport++) { struct vhci_device *vdev = &vhci_hcd->vdev[rhport]; usbip_event_add(&vdev->ud, VDEV_EVENT_REMOVED); usbip_stop_eh(&vdev->ud); } } static int vhci_get_frame_number(struct usb_hcd *hcd) { dev_err_ratelimited(&hcd->self.root_hub->dev, "Not yet implemented\n"); return 0; } #ifdef CONFIG_PM /* FIXME: suspend/resume */ static int vhci_bus_suspend(struct usb_hcd *hcd) { struct vhci *vhci = *((void **)dev_get_platdata(hcd->self.controller)); unsigned long flags; dev_dbg(&hcd->self.root_hub->dev, "%s\n", __func__); spin_lock_irqsave(&vhci->lock, flags); hcd->state = HC_STATE_SUSPENDED; spin_unlock_irqrestore(&vhci->lock, flags); return 0; } static int vhci_bus_resume(struct usb_hcd *hcd) { struct vhci *vhci = *((void **)dev_get_platdata(hcd->self.controller)); int rc = 0; unsigned long flags; dev_dbg(&hcd->self.root_hub->dev, "%s\n", __func__); spin_lock_irqsave(&vhci->lock, flags); if (!HCD_HW_ACCESSIBLE(hcd)) rc = -ESHUTDOWN; else hcd->state = HC_STATE_RUNNING; spin_unlock_irqrestore(&vhci->lock, flags); return rc; } #else #define vhci_bus_suspend NULL #define vhci_bus_resume NULL #endif /* Change a group of bulk endpoints to support multiple stream IDs */ static int vhci_alloc_streams(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint **eps, unsigned int num_eps, unsigned int num_streams, gfp_t mem_flags) { dev_dbg(&hcd->self.root_hub->dev, "vhci_alloc_streams not implemented\n"); return 0; } /* Reverts a group of bulk endpoints back to not using stream IDs. */ static int vhci_free_streams(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint **eps, unsigned int num_eps, gfp_t mem_flags) { dev_dbg(&hcd->self.root_hub->dev, "vhci_free_streams not implemented\n"); return 0; } static const struct hc_driver vhci_hc_driver = { .description = driver_name, .product_desc = driver_desc, .hcd_priv_size = sizeof(struct vhci_hcd), .flags = HCD_USB31 | HCD_SHARED, .reset = vhci_setup, .start = vhci_start, .stop = vhci_stop, .urb_enqueue = vhci_urb_enqueue, .urb_dequeue = vhci_urb_dequeue, .get_frame_number = vhci_get_frame_number, .hub_status_data = vhci_hub_status, .hub_control = vhci_hub_control, .bus_suspend = vhci_bus_suspend, .bus_resume = vhci_bus_resume, .alloc_streams = vhci_alloc_streams, .free_streams = vhci_free_streams, }; static int vhci_hcd_probe(struct platform_device *pdev) { struct vhci *vhci = *((void **)dev_get_platdata(&pdev->dev)); struct usb_hcd *hcd_hs; struct usb_hcd *hcd_ss; int ret; usbip_dbg_vhci_hc("name %s id %d\n", pdev->name, pdev->id); /* * Allocate and initialize hcd. * Our private data is also allocated automatically. */ hcd_hs = usb_create_hcd(&vhci_hc_driver, &pdev->dev, dev_name(&pdev->dev)); if (!hcd_hs) { pr_err("create primary hcd failed\n"); return -ENOMEM; } hcd_hs->has_tt = 1; /* * Finish generic HCD structure initialization and register. * Call the driver's reset() and start() routines. */ ret = usb_add_hcd(hcd_hs, 0, 0); if (ret != 0) { pr_err("usb_add_hcd hs failed %d\n", ret); goto put_usb2_hcd; } hcd_ss = usb_create_shared_hcd(&vhci_hc_driver, &pdev->dev, dev_name(&pdev->dev), hcd_hs); if (!hcd_ss) { ret = -ENOMEM; pr_err("create shared hcd failed\n"); goto remove_usb2_hcd; } ret = usb_add_hcd(hcd_ss, 0, 0); if (ret) { pr_err("usb_add_hcd ss failed %d\n", ret); goto put_usb3_hcd; } usbip_dbg_vhci_hc("bye\n"); return 0; put_usb3_hcd: usb_put_hcd(hcd_ss); remove_usb2_hcd: usb_remove_hcd(hcd_hs); put_usb2_hcd: usb_put_hcd(hcd_hs); vhci->vhci_hcd_hs = NULL; vhci->vhci_hcd_ss = NULL; return ret; } static void vhci_hcd_remove(struct platform_device *pdev) { struct vhci *vhci = *((void **)dev_get_platdata(&pdev->dev)); /* * Disconnects the root hub, * then reverses the effects of usb_add_hcd(), * invoking the HCD's stop() methods. */ usb_remove_hcd(vhci_hcd_to_hcd(vhci->vhci_hcd_ss)); usb_put_hcd(vhci_hcd_to_hcd(vhci->vhci_hcd_ss)); usb_remove_hcd(vhci_hcd_to_hcd(vhci->vhci_hcd_hs)); usb_put_hcd(vhci_hcd_to_hcd(vhci->vhci_hcd_hs)); vhci->vhci_hcd_hs = NULL; vhci->vhci_hcd_ss = NULL; } #ifdef CONFIG_PM /* what should happen for USB/IP under suspend/resume? */ static int vhci_hcd_suspend(struct platform_device *pdev, pm_message_t state) { struct usb_hcd *hcd; struct vhci *vhci; int rhport; int connected = 0; int ret = 0; unsigned long flags; dev_dbg(&pdev->dev, "%s\n", __func__); hcd = platform_get_drvdata(pdev); if (!hcd) return 0; vhci = *((void **)dev_get_platdata(hcd->self.controller)); spin_lock_irqsave(&vhci->lock, flags); for (rhport = 0; rhport < VHCI_HC_PORTS; rhport++) { if (vhci->vhci_hcd_hs->port_status[rhport] & USB_PORT_STAT_CONNECTION) connected += 1; if (vhci->vhci_hcd_ss->port_status[rhport] & USB_PORT_STAT_CONNECTION) connected += 1; } spin_unlock_irqrestore(&vhci->lock, flags); if (connected > 0) { dev_info(&pdev->dev, "We have %d active connection%s. Do not suspend.\n", connected, (connected == 1 ? "" : "s")); ret = -EBUSY; } else { dev_info(&pdev->dev, "suspend vhci_hcd"); clear_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); } return ret; } static int vhci_hcd_resume(struct platform_device *pdev) { struct usb_hcd *hcd; dev_dbg(&pdev->dev, "%s\n", __func__); hcd = platform_get_drvdata(pdev); if (!hcd) return 0; set_bit(HCD_FLAG_HW_ACCESSIBLE, &hcd->flags); usb_hcd_poll_rh_status(hcd); return 0; } #else #define vhci_hcd_suspend NULL #define vhci_hcd_resume NULL #endif static struct platform_driver vhci_driver = { .probe = vhci_hcd_probe, .remove = vhci_hcd_remove, .suspend = vhci_hcd_suspend, .resume = vhci_hcd_resume, .driver = { .name = driver_name, }, }; static void del_platform_devices(void) { int i; for (i = 0; i < vhci_num_controllers; i++) { platform_device_unregister(vhcis[i].pdev); vhcis[i].pdev = NULL; } sysfs_remove_link(&platform_bus.kobj, driver_name); } static int __init vhci_hcd_init(void) { int i, ret; if (usb_disabled()) return -ENODEV; if (vhci_num_controllers < 1) vhci_num_controllers = 1; vhcis = kcalloc(vhci_num_controllers, sizeof(struct vhci), GFP_KERNEL); if (vhcis == NULL) return -ENOMEM; ret = platform_driver_register(&vhci_driver); if (ret) goto err_driver_register; for (i = 0; i < vhci_num_controllers; i++) { void *vhci = &vhcis[i]; struct platform_device_info pdevinfo = { .name = driver_name, .id = i, .data = &vhci, .size_data = sizeof(void *), }; vhcis[i].pdev = platform_device_register_full(&pdevinfo); ret = PTR_ERR_OR_ZERO(vhcis[i].pdev); if (ret < 0) { while (i--) platform_device_unregister(vhcis[i].pdev); goto err_add_hcd; } } return 0; err_add_hcd: platform_driver_unregister(&vhci_driver); err_driver_register: kfree(vhcis); return ret; } static void __exit vhci_hcd_exit(void) { del_platform_devices(); platform_driver_unregister(&vhci_driver); kfree(vhcis); } module_init(vhci_hcd_init); module_exit(vhci_hcd_exit); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); |
1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 | // SPDX-License-Identifier: GPL-2.0-only /* DVB USB framework compliant Linux driver for the * DVBWorld DVB-S 2101, 2102, DVB-S2 2104, DVB-C 3101, * TeVii S421, S480, S482, S600, S630, S632, S650, S660, S662, * Prof 1100, 7500, * Geniatech SU3000, T220, * TechnoTrend S2-4600, * Terratec Cinergy S2 cards * Copyright (C) 2008-2012 Igor M. Liplianin (liplianin@me.by) * * see Documentation/driver-api/media/drivers/dvb-usb.rst for more information */ #include <media/dvb-usb-ids.h> #include "dw2102.h" #include "si21xx.h" #include "stv0299.h" #include "z0194a.h" #include "stv0288.h" #include "stb6000.h" #include "eds1547.h" #include "cx24116.h" #include "tda1002x.h" #include "mt312.h" #include "zl10039.h" #include "ts2020.h" #include "ds3000.h" #include "stv0900.h" #include "stv6110.h" #include "stb6100.h" #include "stb6100_proc.h" #include "m88rs2000.h" #include "tda18271.h" #include "cxd2820r.h" #include "m88ds3103.h" /* Max transfer size done by I2C transfer functions */ #define MAX_XFER_SIZE 64 #define DW210X_READ_MSG 0 #define DW210X_WRITE_MSG 1 #define REG_1F_SYMBOLRATE_BYTE0 0x1f #define REG_20_SYMBOLRATE_BYTE1 0x20 #define REG_21_SYMBOLRATE_BYTE2 0x21 /* on my own*/ #define DW2102_VOLTAGE_CTRL (0x1800) #define SU3000_STREAM_CTRL (0x1900) #define DW2102_RC_QUERY (0x1a00) #define DW2102_LED_CTRL (0x1b00) #define DW2101_FIRMWARE "dvb-usb-dw2101.fw" #define DW2102_FIRMWARE "dvb-usb-dw2102.fw" #define DW2104_FIRMWARE "dvb-usb-dw2104.fw" #define DW3101_FIRMWARE "dvb-usb-dw3101.fw" #define S630_FIRMWARE "dvb-usb-s630.fw" #define S660_FIRMWARE "dvb-usb-s660.fw" #define P1100_FIRMWARE "dvb-usb-p1100.fw" #define P7500_FIRMWARE "dvb-usb-p7500.fw" #define err_str "did not find the firmware file '%s'. You can use <kernel_dir>/scripts/get_dvb_firmware to get the firmware" struct dw2102_state { u8 initialized; u8 last_lock; u8 data[MAX_XFER_SIZE + 4]; struct i2c_client *i2c_client_demod; struct i2c_client *i2c_client_tuner; /* fe hook functions*/ int (*old_set_voltage)(struct dvb_frontend *f, enum fe_sec_voltage v); int (*fe_read_status)(struct dvb_frontend *fe, enum fe_status *status); }; /* debug */ static int dvb_usb_dw2102_debug; module_param_named(debug, dvb_usb_dw2102_debug, int, 0644); MODULE_PARM_DESC(debug, "set debugging level (1=info 2=xfer 4=rc(or-able))." DVB_USB_DEBUG_STATUS); /* demod probe */ static int demod_probe = 1; module_param_named(demod, demod_probe, int, 0644); MODULE_PARM_DESC(demod, "demod to probe (1=cx24116 2=stv0903+stv6110 4=stv0903+stb6100(or-able))."); DVB_DEFINE_MOD_OPT_ADAPTER_NR(adapter_nr); static int dw210x_op_rw(struct usb_device *dev, u8 request, u16 value, u16 index, u8 *data, u16 len, int flags) { int ret; u8 *u8buf; unsigned int pipe = (flags == DW210X_READ_MSG) ? usb_rcvctrlpipe(dev, 0) : usb_sndctrlpipe(dev, 0); u8 request_type = (flags == DW210X_READ_MSG) ? USB_DIR_IN : USB_DIR_OUT; u8buf = kmalloc(len, GFP_KERNEL); if (!u8buf) return -ENOMEM; if (flags == DW210X_WRITE_MSG) memcpy(u8buf, data, len); ret = usb_control_msg(dev, pipe, request, request_type | USB_TYPE_VENDOR, value, index, u8buf, len, 2000); if (flags == DW210X_READ_MSG) memcpy(data, u8buf, len); kfree(u8buf); return ret; } /* I2C */ static int dw2102_i2c_transfer(struct i2c_adapter *adap, struct i2c_msg msg[], int num) { struct dvb_usb_device *d = i2c_get_adapdata(adap); int i = 0; u8 buf6[] = {0x2c, 0x05, 0xc0, 0, 0, 0, 0}; u16 value; if (!d) return -ENODEV; if (mutex_lock_interruptible(&d->i2c_mutex) < 0) return -EAGAIN; switch (num) { case 2: if (msg[0].len < 1) { num = -EOPNOTSUPP; break; } /* read stv0299 register */ value = msg[0].buf[0];/* register */ for (i = 0; i < msg[1].len; i++) { dw210x_op_rw(d->udev, 0xb5, value + i, 0, buf6, 2, DW210X_READ_MSG); msg[1].buf[i] = buf6[0]; } break; case 1: switch (msg[0].addr) { case 0x68: if (msg[0].len < 2) { num = -EOPNOTSUPP; break; } /* write to stv0299 register */ buf6[0] = 0x2a; buf6[1] = msg[0].buf[0]; buf6[2] = msg[0].buf[1]; dw210x_op_rw(d->udev, 0xb2, 0, 0, buf6, 3, DW210X_WRITE_MSG); break; case 0x60: if (msg[0].flags == 0) { if (msg[0].len < 4) { num = -EOPNOTSUPP; break; } /* write to tuner pll */ buf6[0] = 0x2c; buf6[1] = 5; buf6[2] = 0xc0; buf6[3] = msg[0].buf[0]; buf6[4] = msg[0].buf[1]; buf6[5] = msg[0].buf[2]; buf6[6] = msg[0].buf[3]; dw210x_op_rw(d->udev, 0xb2, 0, 0, buf6, 7, DW210X_WRITE_MSG); } else { if (msg[0].len < 1) { num = -EOPNOTSUPP; break; } /* read from tuner */ dw210x_op_rw(d->udev, 0xb5, 0, 0, buf6, 1, DW210X_READ_MSG); msg[0].buf[0] = buf6[0]; } break; case (DW2102_RC_QUERY): if (msg[0].len < 2) { num = -EOPNOTSUPP; break; } dw210x_op_rw(d->udev, 0xb8, 0, 0, buf6, 2, DW210X_READ_MSG); msg[0].buf[0] = buf6[0]; msg[0].buf[1] = buf6[1]; break; case (DW2102_VOLTAGE_CTRL): if (msg[0].len < 1) { num = -EOPNOTSUPP; break; } buf6[0] = 0x30; buf6[1] = msg[0].buf[0]; dw210x_op_rw(d->udev, 0xb2, 0, 0, buf6, 2, DW210X_WRITE_MSG); break; } break; } mutex_unlock(&d->i2c_mutex); return num; } static int dw2102_serit_i2c_transfer(struct i2c_adapter *adap, struct i2c_msg msg[], int num) { struct dvb_usb_device *d = i2c_get_adapdata(adap); u8 buf6[] = {0, 0, 0, 0, 0, 0, 0}; if (!d) return -ENODEV; if (mutex_lock_interruptible(&d->i2c_mutex) < 0) return -EAGAIN; switch (num) { case 2: if (msg[0].len != 1) { warn("i2c rd: len=%d is not 1!\n", msg[0].len); num = -EOPNOTSUPP; break; } if (2 + msg[1].len > sizeof(buf6)) { warn("i2c rd: len=%d is too big!\n", msg[1].len); num = -EOPNOTSUPP; break; } /* read si2109 register by number */ buf6[0] = msg[0].addr << 1; buf6[1] = msg[0].len; buf6[2] = msg[0].buf[0]; dw210x_op_rw(d->udev, 0xc2, 0, 0, buf6, msg[0].len + 2, DW210X_WRITE_MSG); /* read si2109 register */ dw210x_op_rw(d->udev, 0xc3, 0xd0, 0, buf6, msg[1].len + 2, DW210X_READ_MSG); memcpy(msg[1].buf, buf6 + 2, msg[1].len); break; case 1: switch (msg[0].addr) { case 0x68: if (2 + msg[0].len > sizeof(buf6)) { warn("i2c wr: len=%d is too big!\n", msg[0].len); num = -EOPNOTSUPP; break; } /* write to si2109 register */ buf6[0] = msg[0].addr << 1; buf6[1] = msg[0].len; memcpy(buf6 + 2, msg[0].buf, msg[0].len); dw210x_op_rw(d->udev, 0xc2, 0, 0, buf6, msg[0].len + 2, DW210X_WRITE_MSG); break; case(DW2102_RC_QUERY): dw210x_op_rw(d->udev, 0xb8, 0, 0, buf6, 2, DW210X_READ_MSG); msg[0].buf[0] = buf6[0]; msg[0].buf[1] = buf6[1]; break; case(DW2102_VOLTAGE_CTRL): buf6[0] = 0x30; buf6[1] = msg[0].buf[0]; dw210x_op_rw(d->udev, 0xb2, 0, 0, buf6, 2, DW210X_WRITE_MSG); break; } break; } mutex_unlock(&d->i2c_mutex); return num; } static int dw2102_earda_i2c_transfer(struct i2c_adapter *adap, struct i2c_msg msg[], int num) { struct dvb_usb_device *d = i2c_get_adapdata(adap); int ret; if (!d) return -ENODEV; if (mutex_lock_interruptible(&d->i2c_mutex) < 0) return -EAGAIN; switch (num) { case 2: { /* read */ /* first write first register number */ u8 ibuf[MAX_XFER_SIZE], obuf[3]; if (2 + msg[0].len != sizeof(obuf)) { warn("i2c rd: len=%d is not 1!\n", msg[0].len); ret = -EOPNOTSUPP; goto unlock; } if (2 + msg[1].len > sizeof(ibuf)) { warn("i2c rd: len=%d is too big!\n", msg[1].len); ret = -EOPNOTSUPP; goto unlock; } obuf[0] = msg[0].addr << 1; obuf[1] = msg[0].len; obuf[2] = msg[0].buf[0]; dw210x_op_rw(d->udev, 0xc2, 0, 0, obuf, msg[0].len + 2, DW210X_WRITE_MSG); /* second read registers */ dw210x_op_rw(d->udev, 0xc3, 0xd1, 0, ibuf, msg[1].len + 2, DW210X_READ_MSG); memcpy(msg[1].buf, ibuf + 2, msg[1].len); break; } case 1: switch (msg[0].addr) { case 0x68: { /* write to register */ u8 obuf[MAX_XFER_SIZE]; if (2 + msg[0].len > sizeof(obuf)) { warn("i2c wr: len=%d is too big!\n", msg[1].len); ret = -EOPNOTSUPP; goto unlock; } obuf[0] = msg[0].addr << 1; obuf[1] = msg[0].len; memcpy(obuf + 2, msg[0].buf, msg[0].len); dw210x_op_rw(d->udev, 0xc2, 0, 0, obuf, msg[0].len + 2, DW210X_WRITE_MSG); break; } case 0x61: { /* write to tuner */ u8 obuf[MAX_XFER_SIZE]; if (2 + msg[0].len > sizeof(obuf)) { warn("i2c wr: len=%d is too big!\n", msg[1].len); ret = -EOPNOTSUPP; goto unlock; } obuf[0] = msg[0].addr << 1; obuf[1] = msg[0].len; memcpy(obuf + 2, msg[0].buf, msg[0].len); dw210x_op_rw(d->udev, 0xc2, 0, 0, obuf, msg[0].len + 2, DW210X_WRITE_MSG); break; } case(DW2102_RC_QUERY): { u8 ibuf[2]; dw210x_op_rw(d->udev, 0xb8, 0, 0, ibuf, 2, DW210X_READ_MSG); memcpy(msg[0].buf, ibuf, 2); break; } case(DW2102_VOLTAGE_CTRL): { u8 obuf[2]; obuf[0] = 0x30; obuf[1] = msg[0].buf[0]; dw210x_op_rw(d->udev, 0xb2, 0, 0, obuf, 2, DW210X_WRITE_MSG); break; } } break; } ret = num; unlock: mutex_unlock(&d->i2c_mutex); return ret; } static int dw2104_i2c_transfer(struct i2c_adapter *adap, struct i2c_msg msg[], int num) { struct dvb_usb_device *d = i2c_get_adapdata(adap); int len, i, j, ret; if (!d) return -ENODEV; if (mutex_lock_interruptible(&d->i2c_mutex) < 0) return -EAGAIN; for (j = 0; j < num; j++) { switch (msg[j].addr) { case(DW2102_RC_QUERY): { u8 ibuf[2]; dw210x_op_rw(d->udev, 0xb8, 0, 0, ibuf, 2, DW210X_READ_MSG); memcpy(msg[j].buf, ibuf, 2); break; } case(DW2102_VOLTAGE_CTRL): { u8 obuf[2]; obuf[0] = 0x30; obuf[1] = msg[j].buf[0]; dw210x_op_rw(d->udev, 0xb2, 0, 0, obuf, 2, DW210X_WRITE_MSG); break; } /* case 0x55: cx24116 * case 0x6a: stv0903 * case 0x68: ds3000, stv0903 * case 0x60: ts2020, stv6110, stb6100 */ default: { if (msg[j].flags == I2C_M_RD) { /* read registers */ u8 ibuf[MAX_XFER_SIZE]; if (2 + msg[j].len > sizeof(ibuf)) { warn("i2c rd: len=%d is too big!\n", msg[j].len); ret = -EOPNOTSUPP; goto unlock; } dw210x_op_rw(d->udev, 0xc3, (msg[j].addr << 1) + 1, 0, ibuf, msg[j].len + 2, DW210X_READ_MSG); memcpy(msg[j].buf, ibuf + 2, msg[j].len); mdelay(10); } else if (((msg[j].buf[0] == 0xb0) && (msg[j].addr == 0x68)) || ((msg[j].buf[0] == 0xf7) && (msg[j].addr == 0x55))) { /* write firmware */ u8 obuf[19]; obuf[0] = msg[j].addr << 1; obuf[1] = (msg[j].len > 15 ? 17 : msg[j].len); obuf[2] = msg[j].buf[0]; len = msg[j].len - 1; i = 1; do { memcpy(obuf + 3, msg[j].buf + i, (len > 16 ? 16 : len)); dw210x_op_rw(d->udev, 0xc2, 0, 0, obuf, (len > 16 ? 16 : len) + 3, DW210X_WRITE_MSG); i += 16; len -= 16; } while (len > 0); } else { /* write registers */ u8 obuf[MAX_XFER_SIZE]; if (2 + msg[j].len > sizeof(obuf)) { warn("i2c wr: len=%d is too big!\n", msg[j].len); ret = -EOPNOTSUPP; goto unlock; } obuf[0] = msg[j].addr << 1; obuf[1] = msg[j].len; memcpy(obuf + 2, msg[j].buf, msg[j].len); dw210x_op_rw(d->udev, 0xc2, 0, 0, obuf, msg[j].len + 2, DW210X_WRITE_MSG); } break; } } } ret = num; unlock: mutex_unlock(&d->i2c_mutex); return ret; } static int dw3101_i2c_transfer(struct i2c_adapter *adap, struct i2c_msg msg[], int num) { struct dvb_usb_device *d = i2c_get_adapdata(adap); int ret; int i; if (!d) return -ENODEV; if (mutex_lock_interruptible(&d->i2c_mutex) < 0) return -EAGAIN; switch (num) { case 2: { /* read */ /* first write first register number */ u8 ibuf[MAX_XFER_SIZE], obuf[3]; if (2 + msg[0].len != sizeof(obuf)) { warn("i2c rd: len=%d is not 1!\n", msg[0].len); ret = -EOPNOTSUPP; goto unlock; } if (2 + msg[1].len > sizeof(ibuf)) { warn("i2c rd: len=%d is too big!\n", msg[1].len); ret = -EOPNOTSUPP; goto unlock; } obuf[0] = msg[0].addr << 1; obuf[1] = msg[0].len; obuf[2] = msg[0].buf[0]; dw210x_op_rw(d->udev, 0xc2, 0, 0, obuf, msg[0].len + 2, DW210X_WRITE_MSG); /* second read registers */ dw210x_op_rw(d->udev, 0xc3, 0x19, 0, ibuf, msg[1].len + 2, DW210X_READ_MSG); memcpy(msg[1].buf, ibuf + 2, msg[1].len); break; } case 1: switch (msg[0].addr) { case 0x60: case 0x0c: { /* write to register */ u8 obuf[MAX_XFER_SIZE]; if (2 + msg[0].len > sizeof(obuf)) { warn("i2c wr: len=%d is too big!\n", msg[0].len); ret = -EOPNOTSUPP; goto unlock; } obuf[0] = msg[0].addr << 1; obuf[1] = msg[0].len; memcpy(obuf + 2, msg[0].buf, msg[0].len); dw210x_op_rw(d->udev, 0xc2, 0, 0, obuf, msg[0].len + 2, DW210X_WRITE_MSG); break; } case(DW2102_RC_QUERY): { u8 ibuf[2]; dw210x_op_rw(d->udev, 0xb8, 0, 0, ibuf, 2, DW210X_READ_MSG); memcpy(msg[0].buf, ibuf, 2); break; } } break; } for (i = 0; i < num; i++) { deb_xfer("%02x:%02x: %s ", i, msg[i].addr, msg[i].flags == 0 ? ">>>" : "<<<"); debug_dump(msg[i].buf, msg[i].len, deb_xfer); } ret = num; unlock: mutex_unlock(&d->i2c_mutex); return ret; } static int s6x0_i2c_transfer(struct i2c_adapter *adap, struct i2c_msg msg[], int num) { struct dvb_usb_device *d = i2c_get_adapdata(adap); struct usb_device *udev; int len, i, j, ret; if (!d) return -ENODEV; udev = d->udev; if (mutex_lock_interruptible(&d->i2c_mutex) < 0) return -EAGAIN; for (j = 0; j < num; j++) { switch (msg[j].addr) { case (DW2102_RC_QUERY): { u8 ibuf[5]; dw210x_op_rw(d->udev, 0xb8, 0, 0, ibuf, 5, DW210X_READ_MSG); memcpy(msg[j].buf, ibuf + 3, 2); break; } case (DW2102_VOLTAGE_CTRL): { u8 obuf[2]; obuf[0] = 1; obuf[1] = msg[j].buf[1];/* off-on */ dw210x_op_rw(d->udev, 0x8a, 0, 0, obuf, 2, DW210X_WRITE_MSG); obuf[0] = 3; obuf[1] = msg[j].buf[0];/* 13v-18v */ dw210x_op_rw(d->udev, 0x8a, 0, 0, obuf, 2, DW210X_WRITE_MSG); break; } case (DW2102_LED_CTRL): { u8 obuf[2]; obuf[0] = 5; obuf[1] = msg[j].buf[0]; dw210x_op_rw(d->udev, 0x8a, 0, 0, obuf, 2, DW210X_WRITE_MSG); break; } /* case 0x55: cx24116 * case 0x6a: stv0903 * case 0x68: ds3000, stv0903, rs2000 * case 0x60: ts2020, stv6110, stb6100 * case 0xa0: eeprom */ default: { if (msg[j].flags == I2C_M_RD) { /* read registers */ u8 ibuf[MAX_XFER_SIZE]; if (msg[j].len > sizeof(ibuf)) { warn("i2c rd: len=%d is too big!\n", msg[j].len); ret = -EOPNOTSUPP; goto unlock; } dw210x_op_rw(d->udev, 0x91, 0, 0, ibuf, msg[j].len, DW210X_READ_MSG); memcpy(msg[j].buf, ibuf, msg[j].len); break; } else if ((msg[j].buf[0] == 0xb0) && (msg[j].addr == 0x68)) { /* write firmware */ u8 obuf[19]; obuf[0] = (msg[j].len > 16 ? 18 : msg[j].len + 1); obuf[1] = msg[j].addr << 1; obuf[2] = msg[j].buf[0]; len = msg[j].len - 1; i = 1; do { memcpy(obuf + 3, msg[j].buf + i, (len > 16 ? 16 : len)); dw210x_op_rw(d->udev, 0x80, 0, 0, obuf, (len > 16 ? 16 : len) + 3, DW210X_WRITE_MSG); i += 16; len -= 16; } while (len > 0); } else if (j < (num - 1)) { /* write register addr before read */ u8 obuf[MAX_XFER_SIZE]; if (2 + msg[j].len > sizeof(obuf)) { warn("i2c wr: len=%d is too big!\n", msg[j].len); ret = -EOPNOTSUPP; goto unlock; } obuf[0] = msg[j + 1].len; obuf[1] = (msg[j].addr << 1); memcpy(obuf + 2, msg[j].buf, msg[j].len); dw210x_op_rw(d->udev, le16_to_cpu(udev->descriptor.idProduct) == 0x7500 ? 0x92 : 0x90, 0, 0, obuf, msg[j].len + 2, DW210X_WRITE_MSG); break; } else { /* write registers */ u8 obuf[MAX_XFER_SIZE]; if (2 + msg[j].len > sizeof(obuf)) { warn("i2c wr: len=%d is too big!\n", msg[j].len); ret = -EOPNOTSUPP; goto unlock; } obuf[0] = msg[j].len + 1; obuf[1] = (msg[j].addr << 1); memcpy(obuf + 2, msg[j].buf, msg[j].len); dw210x_op_rw(d->udev, 0x80, 0, 0, obuf, msg[j].len + 2, DW210X_WRITE_MSG); break; } break; } } } ret = num; unlock: mutex_unlock(&d->i2c_mutex); return ret; } static int su3000_i2c_transfer(struct i2c_adapter *adap, struct i2c_msg msg[], int num) { struct dvb_usb_device *d = i2c_get_adapdata(adap); struct dw2102_state *state; int j; if (!d) return -ENODEV; state = d->priv; if (mutex_lock_interruptible(&d->i2c_mutex) < 0) return -EAGAIN; if (mutex_lock_interruptible(&d->data_mutex) < 0) { mutex_unlock(&d->i2c_mutex); return -EAGAIN; } j = 0; while (j < num) { switch (msg[j].addr) { case SU3000_STREAM_CTRL: state->data[0] = msg[j].buf[0] + 0x36; state->data[1] = 3; state->data[2] = 0; if (dvb_usb_generic_rw(d, state->data, 3, state->data, 0, 0) < 0) err("i2c transfer failed."); break; case DW2102_RC_QUERY: state->data[0] = 0x10; if (dvb_usb_generic_rw(d, state->data, 1, state->data, 2, 0) < 0) err("i2c transfer failed."); msg[j].buf[1] = state->data[0]; msg[j].buf[0] = state->data[1]; break; default: /* if the current write msg is followed by a another * read msg to/from the same address */ if ((j + 1 < num) && (msg[j + 1].flags & I2C_M_RD) && (msg[j].addr == msg[j + 1].addr)) { /* join both i2c msgs to one usb read command */ if (4 + msg[j].len > sizeof(state->data)) { warn("i2c combined wr/rd: write len=%d is too big!\n", msg[j].len); num = -EOPNOTSUPP; break; } if (1 + msg[j + 1].len > sizeof(state->data)) { warn("i2c combined wr/rd: read len=%d is too big!\n", msg[j + 1].len); num = -EOPNOTSUPP; break; } state->data[0] = 0x09; state->data[1] = msg[j].len; state->data[2] = msg[j + 1].len; state->data[3] = msg[j].addr; memcpy(&state->data[4], msg[j].buf, msg[j].len); if (dvb_usb_generic_rw(d, state->data, msg[j].len + 4, state->data, msg[j + 1].len + 1, 0) < 0) err("i2c transfer failed."); memcpy(msg[j + 1].buf, &state->data[1], msg[j + 1].len); j++; break; } if (msg[j].flags & I2C_M_RD) { /* single read */ if (4 + msg[j].len > sizeof(state->data)) { warn("i2c rd: len=%d is too big!\n", msg[j].len); num = -EOPNOTSUPP; break; } state->data[0] = 0x09; state->data[1] = 0; state->data[2] = msg[j].len; state->data[3] = msg[j].addr; memcpy(&state->data[4], msg[j].buf, msg[j].len); if (dvb_usb_generic_rw(d, state->data, 4, state->data, msg[j].len + 1, 0) < 0) err("i2c transfer failed."); memcpy(msg[j].buf, &state->data[1], msg[j].len); break; } /* single write */ if (3 + msg[j].len > sizeof(state->data)) { warn("i2c wr: len=%d is too big!\n", msg[j].len); num = -EOPNOTSUPP; break; } state->data[0] = 0x08; state->data[1] = msg[j].addr; state->data[2] = msg[j].len; memcpy(&state->data[3], msg[j].buf, msg[j].len); if (dvb_usb_generic_rw(d, state->data, msg[j].len + 3, state->data, 1, 0) < 0) err("i2c transfer failed."); } // switch j++; } // while mutex_unlock(&d->data_mutex); mutex_unlock(&d->i2c_mutex); return num; } static u32 dw210x_i2c_func(struct i2c_adapter *adapter) { return I2C_FUNC_I2C; } static struct i2c_algorithm dw2102_i2c_algo = { .master_xfer = dw2102_i2c_transfer, .functionality = dw210x_i2c_func, }; static struct i2c_algorithm dw2102_serit_i2c_algo = { .master_xfer = dw2102_serit_i2c_transfer, .functionality = dw210x_i2c_func, }; static struct i2c_algorithm dw2102_earda_i2c_algo = { .master_xfer = dw2102_earda_i2c_transfer, .functionality = dw210x_i2c_func, }; static struct i2c_algorithm dw2104_i2c_algo = { .master_xfer = dw2104_i2c_transfer, .functionality = dw210x_i2c_func, }; static struct i2c_algorithm dw3101_i2c_algo = { .master_xfer = dw3101_i2c_transfer, .functionality = dw210x_i2c_func, }; static struct i2c_algorithm s6x0_i2c_algo = { .master_xfer = s6x0_i2c_transfer, .functionality = dw210x_i2c_func, }; static struct i2c_algorithm su3000_i2c_algo = { .master_xfer = su3000_i2c_transfer, .functionality = dw210x_i2c_func, }; static int dw210x_read_mac_address(struct dvb_usb_device *d, u8 mac[6]) { int i; u8 ibuf[] = {0, 0}; u8 eeprom[256], eepromline[16]; for (i = 0; i < 256; i++) { if (dw210x_op_rw(d->udev, 0xb6, 0xa0, i, ibuf, 2, DW210X_READ_MSG) < 0) { err("read eeprom failed."); return -EIO; } else { eepromline[i % 16] = ibuf[0]; eeprom[i] = ibuf[0]; } if ((i % 16) == 15) { deb_xfer("%02x: ", i - 15); debug_dump(eepromline, 16, deb_xfer); } } memcpy(mac, eeprom + 8, 6); return 0; }; static int s6x0_read_mac_address(struct dvb_usb_device *d, u8 mac[6]) { int i, ret; u8 ibuf[] = { 0 }, obuf[] = { 0 }; u8 eeprom[256], eepromline[16]; struct i2c_msg msg[] = { { .addr = 0xa0 >> 1, .flags = 0, .buf = obuf, .len = 1, }, { .addr = 0xa0 >> 1, .flags = I2C_M_RD, .buf = ibuf, .len = 1, } }; for (i = 0; i < 256; i++) { obuf[0] = i; ret = s6x0_i2c_transfer(&d->i2c_adap, msg, 2); if (ret != 2) { err("read eeprom failed."); return -EIO; } else { eepromline[i % 16] = ibuf[0]; eeprom[i] = ibuf[0]; } if ((i % 16) == 15) { deb_xfer("%02x: ", i - 15); debug_dump(eepromline, 16, deb_xfer); } } memcpy(mac, eeprom + 16, 6); return 0; }; static int su3000_streaming_ctrl(struct dvb_usb_adapter *adap, int onoff) { static u8 command_start[] = {0x00}; static u8 command_stop[] = {0x01}; struct i2c_msg msg = { .addr = SU3000_STREAM_CTRL, .flags = 0, .buf = onoff ? command_start : command_stop, .len = 1 }; i2c_transfer(&adap->dev->i2c_adap, &msg, 1); return 0; } static int su3000_power_ctrl(struct dvb_usb_device *d, int i) { struct dw2102_state *state = d->priv; int ret = 0; info("%s: %d, initialized %d", __func__, i, state->initialized); if (i && !state->initialized) { mutex_lock(&d->data_mutex); state->data[0] = 0xde; state->data[1] = 0; state->initialized = 1; /* reset board */ ret = dvb_usb_generic_rw(d, state->data, 2, NULL, 0, 0); mutex_unlock(&d->data_mutex); } return ret; } static int su3000_read_mac_address(struct dvb_usb_device *d, u8 mac[6]) { int i; u8 obuf[] = { 0x1f, 0xf0 }; u8 ibuf[] = { 0 }; struct i2c_msg msg[] = { { .addr = 0x51, .flags = 0, .buf = obuf, .len = 2, }, { .addr = 0x51, .flags = I2C_M_RD, .buf = ibuf, .len = 1, } }; for (i = 0; i < 6; i++) { obuf[1] = 0xf0 + i; if (i2c_transfer(&d->i2c_adap, msg, 2) != 2) return -EIO; else mac[i] = ibuf[0]; } return 0; } static int su3000_identify_state(struct usb_device *udev, const struct dvb_usb_device_properties *props, const struct dvb_usb_device_description **desc, int *cold) { *cold = 0; return 0; } static int dw210x_set_voltage(struct dvb_frontend *fe, enum fe_sec_voltage voltage) { static u8 command_13v[] = {0x00, 0x01}; static u8 command_18v[] = {0x01, 0x01}; static u8 command_off[] = {0x00, 0x00}; struct i2c_msg msg = { .addr = DW2102_VOLTAGE_CTRL, .flags = 0, .buf = command_off, .len = 2, }; struct dvb_usb_adapter *udev_adap = fe->dvb->priv; if (voltage == SEC_VOLTAGE_18) msg.buf = command_18v; else if (voltage == SEC_VOLTAGE_13) msg.buf = command_13v; i2c_transfer(&udev_adap->dev->i2c_adap, &msg, 1); return 0; } static int s660_set_voltage(struct dvb_frontend *fe, enum fe_sec_voltage voltage) { struct dvb_usb_adapter *d = fe->dvb->priv; struct dw2102_state *st = d->dev->priv; dw210x_set_voltage(fe, voltage); if (st->old_set_voltage) st->old_set_voltage(fe, voltage); return 0; } static void dw210x_led_ctrl(struct dvb_frontend *fe, int offon) { static u8 led_off[] = { 0 }; static u8 led_on[] = { 1 }; struct i2c_msg msg = { .addr = DW2102_LED_CTRL, .flags = 0, .buf = led_off, .len = 1 }; struct dvb_usb_adapter *udev_adap = fe->dvb->priv; if (offon) msg.buf = led_on; i2c_transfer(&udev_adap->dev->i2c_adap, &msg, 1); } static int tt_s2_4600_read_status(struct dvb_frontend *fe, enum fe_status *status) { struct dvb_usb_adapter *d = fe->dvb->priv; struct dw2102_state *st = d->dev->priv; int ret; ret = st->fe_read_status(fe, status); /* resync slave fifo when signal change from unlock to lock */ if ((*status & FE_HAS_LOCK) && (!st->last_lock)) su3000_streaming_ctrl(d, 1); st->last_lock = (*status & FE_HAS_LOCK) ? 1 : 0; return ret; } static struct stv0299_config sharp_z0194a_config = { .demod_address = 0x68, .inittab = sharp_z0194a_inittab, .mclk = 88000000UL, .invert = 1, .skip_reinit = 0, .lock_output = STV0299_LOCKOUTPUT_1, .volt13_op0_op1 = STV0299_VOLT13_OP1, .min_delay_ms = 100, .set_symbol_rate = sharp_z0194a_set_symbol_rate, }; static struct cx24116_config dw2104_config = { .demod_address = 0x55, .mpg_clk_pos_pol = 0x01, }; static struct si21xx_config serit_sp1511lhb_config = { .demod_address = 0x68, .min_delay_ms = 100, }; static struct tda10023_config dw3101_tda10023_config = { .demod_address = 0x0c, .invert = 1, }; static struct mt312_config zl313_config = { .demod_address = 0x0e, }; static struct ds3000_config dw2104_ds3000_config = { .demod_address = 0x68, }; static struct ts2020_config dw2104_ts2020_config = { .tuner_address = 0x60, .clk_out_div = 1, .frequency_div = 1060000, }; static struct ds3000_config s660_ds3000_config = { .demod_address = 0x68, .ci_mode = 1, .set_lock_led = dw210x_led_ctrl, }; static struct ts2020_config s660_ts2020_config = { .tuner_address = 0x60, .clk_out_div = 1, .frequency_div = 1146000, }; static struct stv0900_config dw2104a_stv0900_config = { .demod_address = 0x6a, .demod_mode = 0, .xtal = 27000000, .clkmode = 3,/* 0-CLKI, 2-XTALI, else AUTO */ .diseqc_mode = 2,/* 2/3 PWM */ .tun1_maddress = 0,/* 0x60 */ .tun1_adc = 0,/* 2 Vpp */ .path1_mode = 3, }; static struct stb6100_config dw2104a_stb6100_config = { .tuner_address = 0x60, .refclock = 27000000, }; static struct stv0900_config dw2104_stv0900_config = { .demod_address = 0x68, .demod_mode = 0, .xtal = 8000000, .clkmode = 3, .diseqc_mode = 2, .tun1_maddress = 0, .tun1_adc = 1,/* 1 Vpp */ .path1_mode = 3, }; static struct stv6110_config dw2104_stv6110_config = { .i2c_address = 0x60, .mclk = 16000000, .clk_div = 1, }; static struct stv0900_config prof_7500_stv0900_config = { .demod_address = 0x6a, .demod_mode = 0, .xtal = 27000000, .clkmode = 3,/* 0-CLKI, 2-XTALI, else AUTO */ .diseqc_mode = 2,/* 2/3 PWM */ .tun1_maddress = 0,/* 0x60 */ .tun1_adc = 0,/* 2 Vpp */ .path1_mode = 3, .tun1_type = 3, .set_lock_led = dw210x_led_ctrl, }; static struct ds3000_config su3000_ds3000_config = { .demod_address = 0x68, .ci_mode = 1, .set_lock_led = dw210x_led_ctrl, }; static struct cxd2820r_config cxd2820r_config = { .i2c_address = 0x6c, /* (0xd8 >> 1) */ .ts_mode = 0x38, .ts_clock_inv = 1, }; static struct tda18271_config tda18271_config = { .output_opt = TDA18271_OUTPUT_LT_OFF, .gate = TDA18271_GATE_DIGITAL, }; static u8 m88rs2000_inittab[] = { DEMOD_WRITE, 0x9a, 0x30, DEMOD_WRITE, 0x00, 0x01, WRITE_DELAY, 0x19, 0x00, DEMOD_WRITE, 0x00, 0x00, DEMOD_WRITE, 0x9a, 0xb0, DEMOD_WRITE, 0x81, 0xc1, DEMOD_WRITE, 0x81, 0x81, DEMOD_WRITE, 0x86, 0xc6, DEMOD_WRITE, 0x9a, 0x30, DEMOD_WRITE, 0xf0, 0x80, DEMOD_WRITE, 0xf1, 0xbf, DEMOD_WRITE, 0xb0, 0x45, DEMOD_WRITE, 0xb2, 0x01, DEMOD_WRITE, 0x9a, 0xb0, 0xff, 0xaa, 0xff }; static struct m88rs2000_config s421_m88rs2000_config = { .demod_addr = 0x68, .inittab = m88rs2000_inittab, }; static int dw2104_frontend_attach(struct dvb_usb_adapter *d) { struct dvb_tuner_ops *tuner_ops = NULL; if (demod_probe & 4) { d->fe_adap[0].fe = dvb_attach(stv0900_attach, &dw2104a_stv0900_config, &d->dev->i2c_adap, 0); if (d->fe_adap[0].fe) { if (dvb_attach(stb6100_attach, d->fe_adap[0].fe, &dw2104a_stb6100_config, &d->dev->i2c_adap)) { tuner_ops = &d->fe_adap[0].fe->ops.tuner_ops; tuner_ops->set_frequency = stb6100_set_freq; tuner_ops->get_frequency = stb6100_get_freq; tuner_ops->set_bandwidth = stb6100_set_bandw; tuner_ops->get_bandwidth = stb6100_get_bandw; d->fe_adap[0].fe->ops.set_voltage = dw210x_set_voltage; info("Attached STV0900+STB6100!"); return 0; } } } if (demod_probe & 2) { d->fe_adap[0].fe = dvb_attach(stv0900_attach, &dw2104_stv0900_config, &d->dev->i2c_adap, 0); if (d->fe_adap[0].fe) { if (dvb_attach(stv6110_attach, d->fe_adap[0].fe, &dw2104_stv6110_config, &d->dev->i2c_adap)) { d->fe_adap[0].fe->ops.set_voltage = dw210x_set_voltage; info("Attached STV0900+STV6110A!"); return 0; } } } if (demod_probe & 1) { d->fe_adap[0].fe = dvb_attach(cx24116_attach, &dw2104_config, &d->dev->i2c_adap); if (d->fe_adap[0].fe) { d->fe_adap[0].fe->ops.set_voltage = dw210x_set_voltage; info("Attached cx24116!"); return 0; } } d->fe_adap[0].fe = dvb_attach(ds3000_attach, &dw2104_ds3000_config, &d->dev->i2c_adap); if (d->fe_adap[0].fe) { dvb_attach(ts2020_attach, d->fe_adap[0].fe, &dw2104_ts2020_config, &d->dev->i2c_adap); d->fe_adap[0].fe->ops.set_voltage = dw210x_set_voltage; info("Attached DS3000!"); return 0; } return -EIO; } static struct dvb_usb_device_properties dw2102_properties; static struct dvb_usb_device_properties dw2104_properties; static struct dvb_usb_device_properties s6x0_properties; static int dw2102_frontend_attach(struct dvb_usb_adapter *d) { if (dw2102_properties.i2c_algo == &dw2102_serit_i2c_algo) { /*dw2102_properties.adapter->tuner_attach = NULL;*/ d->fe_adap[0].fe = dvb_attach(si21xx_attach, &serit_sp1511lhb_config, &d->dev->i2c_adap); if (d->fe_adap[0].fe) { d->fe_adap[0].fe->ops.set_voltage = dw210x_set_voltage; info("Attached si21xx!"); return 0; } } if (dw2102_properties.i2c_algo == &dw2102_earda_i2c_algo) { d->fe_adap[0].fe = dvb_attach(stv0288_attach, &earda_config, &d->dev->i2c_adap); if (d->fe_adap[0].fe) { if (dvb_attach(stb6000_attach, d->fe_adap[0].fe, 0x61, &d->dev->i2c_adap)) { d->fe_adap[0].fe->ops.set_voltage = dw210x_set_voltage; info("Attached stv0288!"); return 0; } } } if (dw2102_properties.i2c_algo == &dw2102_i2c_algo) { /*dw2102_properties.adapter->tuner_attach = dw2102_tuner_attach;*/ d->fe_adap[0].fe = dvb_attach(stv0299_attach, &sharp_z0194a_config, &d->dev->i2c_adap); if (d->fe_adap[0].fe) { d->fe_adap[0].fe->ops.set_voltage = dw210x_set_voltage; info("Attached stv0299!"); return 0; } } return -EIO; } static int dw3101_frontend_attach(struct dvb_usb_adapter *d) { d->fe_adap[0].fe = dvb_attach(tda10023_attach, &dw3101_tda10023_config, &d->dev->i2c_adap, 0x48); if (d->fe_adap[0].fe) { info("Attached tda10023!"); return 0; } return -EIO; } static int zl100313_frontend_attach(struct dvb_usb_adapter *d) { d->fe_adap[0].fe = dvb_attach(mt312_attach, &zl313_config, &d->dev->i2c_adap); if (d->fe_adap[0].fe) { if (dvb_attach(zl10039_attach, d->fe_adap[0].fe, 0x60, &d->dev->i2c_adap)) { d->fe_adap[0].fe->ops.set_voltage = dw210x_set_voltage; info("Attached zl100313+zl10039!"); return 0; } } return -EIO; } static int stv0288_frontend_attach(struct dvb_usb_adapter *d) { u8 obuf[] = {7, 1}; d->fe_adap[0].fe = dvb_attach(stv0288_attach, &earda_config, &d->dev->i2c_adap); if (!d->fe_adap[0].fe) return -EIO; if (dvb_attach(stb6000_attach, d->fe_adap[0].fe, 0x61, &d->dev->i2c_adap) == NULL) return -EIO; d->fe_adap[0].fe->ops.set_voltage = dw210x_set_voltage; dw210x_op_rw(d->dev->udev, 0x8a, 0, 0, obuf, 2, DW210X_WRITE_MSG); info("Attached stv0288+stb6000!"); return 0; } static int ds3000_frontend_attach(struct dvb_usb_adapter *d) { struct dw2102_state *st = d->dev->priv; u8 obuf[] = {7, 1}; d->fe_adap[0].fe = dvb_attach(ds3000_attach, &s660_ds3000_config, &d->dev->i2c_adap); if (!d->fe_adap[0].fe) return -EIO; dvb_attach(ts2020_attach, d->fe_adap[0].fe, &s660_ts2020_config, &d->dev->i2c_adap); st->old_set_voltage = d->fe_adap[0].fe->ops.set_voltage; d->fe_adap[0].fe->ops.set_voltage = s660_set_voltage; dw210x_op_rw(d->dev->udev, 0x8a, 0, 0, obuf, 2, DW210X_WRITE_MSG); info("Attached ds3000+ts2020!"); return 0; } static int prof_7500_frontend_attach(struct dvb_usb_adapter *d) { u8 obuf[] = {7, 1}; d->fe_adap[0].fe = dvb_attach(stv0900_attach, &prof_7500_stv0900_config, &d->dev->i2c_adap, 0); if (!d->fe_adap[0].fe) return -EIO; d->fe_adap[0].fe->ops.set_voltage = dw210x_set_voltage; dw210x_op_rw(d->dev->udev, 0x8a, 0, 0, obuf, 2, DW210X_WRITE_MSG); info("Attached STV0900+STB6100A!"); return 0; } static int su3000_frontend_attach(struct dvb_usb_adapter *adap) { struct dvb_usb_device *d = adap->dev; struct dw2102_state *state = d->priv; mutex_lock(&d->data_mutex); state->data[0] = 0xe; state->data[1] = 0x80; state->data[2] = 0; if (dvb_usb_generic_rw(d, state->data, 3, state->data, 1, 0) < 0) err("command 0x0e transfer failed."); state->data[0] = 0xe; state->data[1] = 0x02; state->data[2] = 1; if (dvb_usb_generic_rw(d, state->data, 3, state->data, 1, 0) < 0) err("command 0x0e transfer failed."); msleep(300); state->data[0] = 0xe; state->data[1] = 0x83; state->data[2] = 0; if (dvb_usb_generic_rw(d, state->data, 3, state->data, 1, 0) < 0) err("command 0x0e transfer failed."); state->data[0] = 0xe; state->data[1] = 0x83; state->data[2] = 1; if (dvb_usb_generic_rw(d, state->data, 3, state->data, 1, 0) < 0) err("command 0x0e transfer failed."); state->data[0] = 0x51; if (dvb_usb_generic_rw(d, state->data, 1, state->data, 1, 0) < 0) err("command 0x51 transfer failed."); mutex_unlock(&d->data_mutex); adap->fe_adap[0].fe = dvb_attach(ds3000_attach, &su3000_ds3000_config, &d->i2c_adap); if (!adap->fe_adap[0].fe) return -EIO; if (dvb_attach(ts2020_attach, adap->fe_adap[0].fe, &dw2104_ts2020_config, &d->i2c_adap)) { info("Attached DS3000/TS2020!"); return 0; } info("Failed to attach DS3000/TS2020!"); return -EIO; } static int t220_frontend_attach(struct dvb_usb_adapter *adap) { struct dvb_usb_device *d = adap->dev; struct dw2102_state *state = d->priv; mutex_lock(&d->data_mutex); state->data[0] = 0xe; state->data[1] = 0x87; state->data[2] = 0x0; if (dvb_usb_generic_rw(d, state->data, 3, state->data, 1, 0) < 0) err("command 0x0e transfer failed."); state->data[0] = 0xe; state->data[1] = 0x86; state->data[2] = 1; if (dvb_usb_generic_rw(d, state->data, 3, state->data, 1, 0) < 0) err("command 0x0e transfer failed."); state->data[0] = 0xe; state->data[1] = 0x80; state->data[2] = 0; if (dvb_usb_generic_rw(d, state->data, 3, state->data, 1, 0) < 0) err("command 0x0e transfer failed."); msleep(50); state->data[0] = 0xe; state->data[1] = 0x80; state->data[2] = 1; if (dvb_usb_generic_rw(d, state->data, 3, state->data, 1, 0) < 0) err("command 0x0e transfer failed."); state->data[0] = 0x51; if (dvb_usb_generic_rw(d, state->data, 1, state->data, 1, 0) < 0) err("command 0x51 transfer failed."); mutex_unlock(&d->data_mutex); adap->fe_adap[0].fe = dvb_attach(cxd2820r_attach, &cxd2820r_config, &d->i2c_adap, NULL); if (adap->fe_adap[0].fe) { if (dvb_attach(tda18271_attach, adap->fe_adap[0].fe, 0x60, &d->i2c_adap, &tda18271_config)) { info("Attached TDA18271HD/CXD2820R!"); return 0; } } info("Failed to attach TDA18271HD/CXD2820R!"); return -EIO; } static int m88rs2000_frontend_attach(struct dvb_usb_adapter *adap) { struct dvb_usb_device *d = adap->dev; struct dw2102_state *state = d->priv; mutex_lock(&d->data_mutex); state->data[0] = 0x51; if (dvb_usb_generic_rw(d, state->data, 1, state->data, 1, 0) < 0) err("command 0x51 transfer failed."); mutex_unlock(&d->data_mutex); adap->fe_adap[0].fe = dvb_attach(m88rs2000_attach, &s421_m88rs2000_config, &d->i2c_adap); if (!adap->fe_adap[0].fe) return -EIO; if (dvb_attach(ts2020_attach, adap->fe_adap[0].fe, &dw2104_ts2020_config, &d->i2c_adap)) { info("Attached RS2000/TS2020!"); return 0; } info("Failed to attach RS2000/TS2020!"); return -EIO; } static int tt_s2_4600_frontend_attach_probe_demod(struct dvb_usb_device *d, const int probe_addr) { struct dw2102_state *state = d->priv; state->data[0] = 0x9; state->data[1] = 0x1; state->data[2] = 0x1; state->data[3] = probe_addr; state->data[4] = 0x0; if (dvb_usb_generic_rw(d, state->data, 5, state->data, 2, 0) < 0) { err("i2c probe for address 0x%x failed.", probe_addr); return 0; } if (state->data[0] != 8) /* fail(7) or error, no device at address */ return 0; /* probing successful */ return 1; } static int tt_s2_4600_frontend_attach(struct dvb_usb_adapter *adap) { struct dvb_usb_device *d = adap->dev; struct dw2102_state *state = d->priv; struct i2c_adapter *i2c_adapter; struct i2c_client *client; struct i2c_board_info board_info; struct m88ds3103_platform_data m88ds3103_pdata = {}; struct ts2020_config ts2020_config = {}; int demod_addr; mutex_lock(&d->data_mutex); state->data[0] = 0xe; state->data[1] = 0x80; state->data[2] = 0x0; if (dvb_usb_generic_rw(d, state->data, 3, state->data, 1, 0) < 0) err("command 0x0e transfer failed."); state->data[0] = 0xe; state->data[1] = 0x02; state->data[2] = 1; if (dvb_usb_generic_rw(d, state->data, 3, state->data, 1, 0) < 0) err("command 0x0e transfer failed."); msleep(300); state->data[0] = 0xe; state->data[1] = 0x83; state->data[2] = 0; if (dvb_usb_generic_rw(d, state->data, 3, state->data, 1, 0) < 0) err("command 0x0e transfer failed."); state->data[0] = 0xe; state->data[1] = 0x83; state->data[2] = 1; if (dvb_usb_generic_rw(d, state->data, 3, state->data, 1, 0) < 0) err("command 0x0e transfer failed."); state->data[0] = 0x51; if (dvb_usb_generic_rw(d, state->data, 1, state->data, 1, 0) < 0) err("command 0x51 transfer failed."); /* probe for demodulator i2c address */ demod_addr = -1; if (tt_s2_4600_frontend_attach_probe_demod(d, 0x68)) demod_addr = 0x68; else if (tt_s2_4600_frontend_attach_probe_demod(d, 0x69)) demod_addr = 0x69; else if (tt_s2_4600_frontend_attach_probe_demod(d, 0x6a)) demod_addr = 0x6a; mutex_unlock(&d->data_mutex); if (demod_addr < 0) { err("probing for demodulator failed. Is the external power switched on?"); return -ENODEV; } /* attach demod */ m88ds3103_pdata.clk = 27000000; m88ds3103_pdata.i2c_wr_max = 33; m88ds3103_pdata.ts_mode = M88DS3103_TS_CI; m88ds3103_pdata.ts_clk = 16000; m88ds3103_pdata.ts_clk_pol = 0; m88ds3103_pdata.spec_inv = 0; m88ds3103_pdata.agc = 0x99; m88ds3103_pdata.agc_inv = 0; m88ds3103_pdata.clk_out = M88DS3103_CLOCK_OUT_ENABLED; m88ds3103_pdata.envelope_mode = 0; m88ds3103_pdata.lnb_hv_pol = 1; m88ds3103_pdata.lnb_en_pol = 0; memset(&board_info, 0, sizeof(board_info)); if (demod_addr == 0x6a) strscpy(board_info.type, "m88ds3103b", I2C_NAME_SIZE); else strscpy(board_info.type, "m88ds3103", I2C_NAME_SIZE); board_info.addr = demod_addr; board_info.platform_data = &m88ds3103_pdata; request_module("m88ds3103"); client = i2c_new_client_device(&d->i2c_adap, &board_info); if (!i2c_client_has_driver(client)) return -ENODEV; if (!try_module_get(client->dev.driver->owner)) { i2c_unregister_device(client); return -ENODEV; } adap->fe_adap[0].fe = m88ds3103_pdata.get_dvb_frontend(client); i2c_adapter = m88ds3103_pdata.get_i2c_adapter(client); state->i2c_client_demod = client; /* attach tuner */ ts2020_config.fe = adap->fe_adap[0].fe; memset(&board_info, 0, sizeof(board_info)); strscpy(board_info.type, "ts2022", I2C_NAME_SIZE); board_info.addr = 0x60; board_info.platform_data = &ts2020_config; request_module("ts2020"); client = i2c_new_client_device(i2c_adapter, &board_info); if (!i2c_client_has_driver(client)) { dvb_frontend_detach(adap->fe_adap[0].fe); return -ENODEV; } if (!try_module_get(client->dev.driver->owner)) { i2c_unregister_device(client); dvb_frontend_detach(adap->fe_adap[0].fe); return -ENODEV; } /* delegate signal strength measurement to tuner */ adap->fe_adap[0].fe->ops.read_signal_strength = adap->fe_adap[0].fe->ops.tuner_ops.get_rf_strength; state->i2c_client_tuner = client; /* hook fe: need to resync the slave fifo when signal locks */ state->fe_read_status = adap->fe_adap[0].fe->ops.read_status; adap->fe_adap[0].fe->ops.read_status = tt_s2_4600_read_status; state->last_lock = 0; return 0; } static int dw2102_tuner_attach(struct dvb_usb_adapter *adap) { dvb_attach(dvb_pll_attach, adap->fe_adap[0].fe, 0x60, &adap->dev->i2c_adap, DVB_PLL_OPERA1); return 0; } static int dw3101_tuner_attach(struct dvb_usb_adapter *adap) { dvb_attach(dvb_pll_attach, adap->fe_adap[0].fe, 0x60, &adap->dev->i2c_adap, DVB_PLL_TUA6034); return 0; } static int dw2102_rc_query(struct dvb_usb_device *d) { u8 key[2]; struct i2c_msg msg = { .addr = DW2102_RC_QUERY, .flags = I2C_M_RD, .buf = key, .len = 2 }; if (d->props.i2c_algo->master_xfer(&d->i2c_adap, &msg, 1) == 1) { if (msg.buf[0] != 0xff) { deb_rc("%s: rc code: %x, %x\n", __func__, key[0], key[1]); rc_keydown(d->rc_dev, RC_PROTO_UNKNOWN, key[0], 0); } } return 0; } static int prof_rc_query(struct dvb_usb_device *d) { u8 key[2]; struct i2c_msg msg = { .addr = DW2102_RC_QUERY, .flags = I2C_M_RD, .buf = key, .len = 2 }; if (d->props.i2c_algo->master_xfer(&d->i2c_adap, &msg, 1) == 1) { if (msg.buf[0] != 0xff) { deb_rc("%s: rc code: %x, %x\n", __func__, key[0], key[1]); rc_keydown(d->rc_dev, RC_PROTO_UNKNOWN, key[0] ^ 0xff, 0); } } return 0; } static int su3000_rc_query(struct dvb_usb_device *d) { u8 key[2]; struct i2c_msg msg = { .addr = DW2102_RC_QUERY, .flags = I2C_M_RD, .buf = key, .len = 2 }; if (d->props.i2c_algo->master_xfer(&d->i2c_adap, &msg, 1) == 1) { if (msg.buf[0] != 0xff) { deb_rc("%s: rc code: %x, %x\n", __func__, key[0], key[1]); rc_keydown(d->rc_dev, RC_PROTO_RC5, RC_SCANCODE_RC5(key[1], key[0]), 0); } } return 0; } enum dw2102_table_entry { CYPRESS_DW2102, CYPRESS_DW2101, CYPRESS_DW2104, TEVII_S650, TERRATEC_CINERGY_S, CYPRESS_DW3101, TEVII_S630, PROF_1100, TEVII_S660, PROF_7500, GENIATECH_SU3000, HAUPPAUGE_MAX_S2, TERRATEC_CINERGY_S2_R1, TEVII_S480_1, TEVII_S480_2, GENIATECH_X3M_SPC1400HD, TEVII_S421, TEVII_S632, TERRATEC_CINERGY_S2_R2, TERRATEC_CINERGY_S2_R3, TERRATEC_CINERGY_S2_R4, TERRATEC_CINERGY_S2_1, TERRATEC_CINERGY_S2_2, GOTVIEW_SAT_HD, GENIATECH_T220, TECHNOTREND_CONNECT_S2_4600, TEVII_S482_1, TEVII_S482_2, TEVII_S662 }; static struct usb_device_id dw2102_table[] = { DVB_USB_DEV(CYPRESS, CYPRESS_DW2102), DVB_USB_DEV(CYPRESS, CYPRESS_DW2101), DVB_USB_DEV(CYPRESS, CYPRESS_DW2104), DVB_USB_DEV(TEVII, TEVII_S650), DVB_USB_DEV(TERRATEC, TERRATEC_CINERGY_S), DVB_USB_DEV(CYPRESS, CYPRESS_DW3101), DVB_USB_DEV(TEVII, TEVII_S630), DVB_USB_DEV(PROF_1, PROF_1100), DVB_USB_DEV(TEVII, TEVII_S660), DVB_USB_DEV(PROF_2, PROF_7500), DVB_USB_DEV(GTEK, GENIATECH_SU3000), DVB_USB_DEV(HAUPPAUGE, HAUPPAUGE_MAX_S2), DVB_USB_DEV(TERRATEC, TERRATEC_CINERGY_S2_R1), DVB_USB_DEV(TEVII, TEVII_S480_1), DVB_USB_DEV(TEVII, TEVII_S480_2), DVB_USB_DEV(GTEK, GENIATECH_X3M_SPC1400HD), DVB_USB_DEV(TEVII, TEVII_S421), DVB_USB_DEV(TEVII, TEVII_S632), DVB_USB_DEV(TERRATEC, TERRATEC_CINERGY_S2_R2), DVB_USB_DEV(TERRATEC, TERRATEC_CINERGY_S2_R3), DVB_USB_DEV(TERRATEC, TERRATEC_CINERGY_S2_R4), DVB_USB_DEV(TERRATEC_2, TERRATEC_CINERGY_S2_1), DVB_USB_DEV(TERRATEC_2, TERRATEC_CINERGY_S2_2), DVB_USB_DEV(GOTVIEW, GOTVIEW_SAT_HD), DVB_USB_DEV(GTEK, GENIATECH_T220), DVB_USB_DEV(TECHNOTREND, TECHNOTREND_CONNECT_S2_4600), DVB_USB_DEV(TEVII, TEVII_S482_1), DVB_USB_DEV(TEVII, TEVII_S482_2), DVB_USB_DEV(TEVII, TEVII_S662), { } }; MODULE_DEVICE_TABLE(usb, dw2102_table); static int dw2102_load_firmware(struct usb_device *dev, const struct firmware *frmwr) { u8 *b, *p; int ret = 0, i; u8 reset; u8 reset16[] = {0, 0, 0, 0, 0, 0, 0}; const struct firmware *fw; switch (le16_to_cpu(dev->descriptor.idProduct)) { case 0x2101: ret = request_firmware(&fw, DW2101_FIRMWARE, &dev->dev); if (ret != 0) { err(err_str, DW2101_FIRMWARE); return ret; } break; default: fw = frmwr; break; } info("start downloading DW210X firmware"); p = kmalloc(fw->size, GFP_KERNEL); reset = 1; /*stop the CPU*/ dw210x_op_rw(dev, 0xa0, 0x7f92, 0, &reset, 1, DW210X_WRITE_MSG); dw210x_op_rw(dev, 0xa0, 0xe600, 0, &reset, 1, DW210X_WRITE_MSG); if (p) { memcpy(p, fw->data, fw->size); for (i = 0; i < fw->size; i += 0x40) { b = (u8 *)p + i; if (dw210x_op_rw(dev, 0xa0, i, 0, b, 0x40, DW210X_WRITE_MSG) != 0x40) { err("error while transferring firmware"); ret = -EINVAL; break; } } /* restart the CPU */ reset = 0; if (ret || dw210x_op_rw(dev, 0xa0, 0x7f92, 0, &reset, 1, DW210X_WRITE_MSG) != 1) { err("could not restart the USB controller CPU."); ret = -EINVAL; } if (ret || dw210x_op_rw(dev, 0xa0, 0xe600, 0, &reset, 1, DW210X_WRITE_MSG) != 1) { err("could not restart the USB controller CPU."); ret = -EINVAL; } /* init registers */ switch (le16_to_cpu(dev->descriptor.idProduct)) { case USB_PID_TEVII_S650: dw2104_properties.rc.core.rc_codes = RC_MAP_TEVII_NEC; fallthrough; case USB_PID_CYPRESS_DW2104: reset = 1; dw210x_op_rw(dev, 0xc4, 0x0000, 0, &reset, 1, DW210X_WRITE_MSG); fallthrough; case USB_PID_CYPRESS_DW3101: reset = 0; dw210x_op_rw(dev, 0xbf, 0x0040, 0, &reset, 0, DW210X_WRITE_MSG); break; case USB_PID_TERRATEC_CINERGY_S: case USB_PID_CYPRESS_DW2102: dw210x_op_rw(dev, 0xbf, 0x0040, 0, &reset, 0, DW210X_WRITE_MSG); dw210x_op_rw(dev, 0xb9, 0x0000, 0, &reset16[0], 2, DW210X_READ_MSG); /* check STV0299 frontend */ dw210x_op_rw(dev, 0xb5, 0, 0, &reset16[0], 2, DW210X_READ_MSG); if ((reset16[0] == 0xa1) || (reset16[0] == 0x80)) { dw2102_properties.i2c_algo = &dw2102_i2c_algo; dw2102_properties.adapter->fe[0].tuner_attach = &dw2102_tuner_attach; break; } /* check STV0288 frontend */ reset16[0] = 0xd0; reset16[1] = 1; reset16[2] = 0; dw210x_op_rw(dev, 0xc2, 0, 0, &reset16[0], 3, DW210X_WRITE_MSG); dw210x_op_rw(dev, 0xc3, 0xd1, 0, &reset16[0], 3, DW210X_READ_MSG); if (reset16[2] == 0x11) { dw2102_properties.i2c_algo = &dw2102_earda_i2c_algo; break; } fallthrough; case 0x2101: dw210x_op_rw(dev, 0xbc, 0x0030, 0, &reset16[0], 2, DW210X_READ_MSG); dw210x_op_rw(dev, 0xba, 0x0000, 0, &reset16[0], 7, DW210X_READ_MSG); dw210x_op_rw(dev, 0xba, 0x0000, 0, &reset16[0], 7, DW210X_READ_MSG); dw210x_op_rw(dev, 0xb9, 0x0000, 0, &reset16[0], 2, DW210X_READ_MSG); break; } msleep(100); kfree(p); } if (le16_to_cpu(dev->descriptor.idProduct) == 0x2101) release_firmware(fw); return ret; } static struct dvb_usb_device_properties dw2102_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .firmware = DW2102_FIRMWARE, .no_reconnect = 1, .i2c_algo = &dw2102_serit_i2c_algo, .rc.core = { .rc_interval = 150, .rc_codes = RC_MAP_DM1105_NEC, .module_name = "dw2102", .allowed_protos = RC_PROTO_BIT_NEC, .rc_query = dw2102_rc_query, }, .generic_bulk_ctrl_endpoint = 0x81, /* parameter for the MPEG2-data transfer */ .num_adapters = 1, .download_firmware = dw2102_load_firmware, .read_mac_address = dw210x_read_mac_address, .adapter = { { .num_frontends = 1, .fe = {{ .frontend_attach = dw2102_frontend_attach, .stream = { .type = USB_BULK, .count = 8, .endpoint = 0x82, .u = { .bulk = { .buffersize = 4096, } } }, }}, } }, .num_device_descs = 3, .devices = { {"DVBWorld DVB-S 2102 USB2.0", {&dw2102_table[CYPRESS_DW2102], NULL}, {NULL}, }, {"DVBWorld DVB-S 2101 USB2.0", {&dw2102_table[CYPRESS_DW2101], NULL}, {NULL}, }, {"TerraTec Cinergy S USB", {&dw2102_table[TERRATEC_CINERGY_S], NULL}, {NULL}, }, } }; static struct dvb_usb_device_properties dw2104_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .firmware = DW2104_FIRMWARE, .no_reconnect = 1, .i2c_algo = &dw2104_i2c_algo, .rc.core = { .rc_interval = 150, .rc_codes = RC_MAP_DM1105_NEC, .module_name = "dw2102", .allowed_protos = RC_PROTO_BIT_NEC, .rc_query = dw2102_rc_query, }, .generic_bulk_ctrl_endpoint = 0x81, /* parameter for the MPEG2-data transfer */ .num_adapters = 1, .download_firmware = dw2102_load_firmware, .read_mac_address = dw210x_read_mac_address, .adapter = { { .num_frontends = 1, .fe = {{ .frontend_attach = dw2104_frontend_attach, .stream = { .type = USB_BULK, .count = 8, .endpoint = 0x82, .u = { .bulk = { .buffersize = 4096, } } }, }}, } }, .num_device_descs = 2, .devices = { { "DVBWorld DW2104 USB2.0", {&dw2102_table[CYPRESS_DW2104], NULL}, {NULL}, }, { "TeVii S650 USB2.0", {&dw2102_table[TEVII_S650], NULL}, {NULL}, }, } }; static struct dvb_usb_device_properties dw3101_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .firmware = DW3101_FIRMWARE, .no_reconnect = 1, .i2c_algo = &dw3101_i2c_algo, .rc.core = { .rc_interval = 150, .rc_codes = RC_MAP_DM1105_NEC, .module_name = "dw2102", .allowed_protos = RC_PROTO_BIT_NEC, .rc_query = dw2102_rc_query, }, .generic_bulk_ctrl_endpoint = 0x81, /* parameter for the MPEG2-data transfer */ .num_adapters = 1, .download_firmware = dw2102_load_firmware, .read_mac_address = dw210x_read_mac_address, .adapter = { { .num_frontends = 1, .fe = {{ .frontend_attach = dw3101_frontend_attach, .tuner_attach = dw3101_tuner_attach, .stream = { .type = USB_BULK, .count = 8, .endpoint = 0x82, .u = { .bulk = { .buffersize = 4096, } } }, }}, } }, .num_device_descs = 1, .devices = { { "DVBWorld DVB-C 3101 USB2.0", {&dw2102_table[CYPRESS_DW3101], NULL}, {NULL}, }, } }; static struct dvb_usb_device_properties s6x0_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .size_of_priv = sizeof(struct dw2102_state), .firmware = S630_FIRMWARE, .no_reconnect = 1, .i2c_algo = &s6x0_i2c_algo, .rc.core = { .rc_interval = 150, .rc_codes = RC_MAP_TEVII_NEC, .module_name = "dw2102", .allowed_protos = RC_PROTO_BIT_NEC, .rc_query = dw2102_rc_query, }, .generic_bulk_ctrl_endpoint = 0x81, .num_adapters = 1, .download_firmware = dw2102_load_firmware, .read_mac_address = s6x0_read_mac_address, .adapter = { { .num_frontends = 1, .fe = {{ .frontend_attach = zl100313_frontend_attach, .stream = { .type = USB_BULK, .count = 8, .endpoint = 0x82, .u = { .bulk = { .buffersize = 4096, } } }, }}, } }, .num_device_descs = 1, .devices = { {"TeVii S630 USB", {&dw2102_table[TEVII_S630], NULL}, {NULL}, }, } }; static struct dvb_usb_device_properties p1100_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .size_of_priv = sizeof(struct dw2102_state), .firmware = P1100_FIRMWARE, .no_reconnect = 1, .i2c_algo = &s6x0_i2c_algo, .rc.core = { .rc_interval = 150, .rc_codes = RC_MAP_TBS_NEC, .module_name = "dw2102", .allowed_protos = RC_PROTO_BIT_NEC, .rc_query = prof_rc_query, }, .generic_bulk_ctrl_endpoint = 0x81, .num_adapters = 1, .download_firmware = dw2102_load_firmware, .read_mac_address = s6x0_read_mac_address, .adapter = { { .num_frontends = 1, .fe = {{ .frontend_attach = stv0288_frontend_attach, .stream = { .type = USB_BULK, .count = 8, .endpoint = 0x82, .u = { .bulk = { .buffersize = 4096, } } }, } }, } }, .num_device_descs = 1, .devices = { {"Prof 1100 USB ", {&dw2102_table[PROF_1100], NULL}, {NULL}, }, } }; static struct dvb_usb_device_properties s660_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .size_of_priv = sizeof(struct dw2102_state), .firmware = S660_FIRMWARE, .no_reconnect = 1, .i2c_algo = &s6x0_i2c_algo, .rc.core = { .rc_interval = 150, .rc_codes = RC_MAP_TEVII_NEC, .module_name = "dw2102", .allowed_protos = RC_PROTO_BIT_NEC, .rc_query = dw2102_rc_query, }, .generic_bulk_ctrl_endpoint = 0x81, .num_adapters = 1, .download_firmware = dw2102_load_firmware, .read_mac_address = s6x0_read_mac_address, .adapter = { { .num_frontends = 1, .fe = {{ .frontend_attach = ds3000_frontend_attach, .stream = { .type = USB_BULK, .count = 8, .endpoint = 0x82, .u = { .bulk = { .buffersize = 4096, } } }, } }, } }, .num_device_descs = 3, .devices = { {"TeVii S660 USB", {&dw2102_table[TEVII_S660], NULL}, {NULL}, }, {"TeVii S480.1 USB", {&dw2102_table[TEVII_S480_1], NULL}, {NULL}, }, {"TeVii S480.2 USB", {&dw2102_table[TEVII_S480_2], NULL}, {NULL}, }, } }; static struct dvb_usb_device_properties p7500_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .size_of_priv = sizeof(struct dw2102_state), .firmware = P7500_FIRMWARE, .no_reconnect = 1, .i2c_algo = &s6x0_i2c_algo, .rc.core = { .rc_interval = 150, .rc_codes = RC_MAP_TBS_NEC, .module_name = "dw2102", .allowed_protos = RC_PROTO_BIT_NEC, .rc_query = prof_rc_query, }, .generic_bulk_ctrl_endpoint = 0x81, .num_adapters = 1, .download_firmware = dw2102_load_firmware, .read_mac_address = s6x0_read_mac_address, .adapter = { { .num_frontends = 1, .fe = {{ .frontend_attach = prof_7500_frontend_attach, .stream = { .type = USB_BULK, .count = 8, .endpoint = 0x82, .u = { .bulk = { .buffersize = 4096, } } }, } }, } }, .num_device_descs = 1, .devices = { {"Prof 7500 USB DVB-S2", {&dw2102_table[PROF_7500], NULL}, {NULL}, }, } }; static struct dvb_usb_device_properties su3000_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .size_of_priv = sizeof(struct dw2102_state), .power_ctrl = su3000_power_ctrl, .num_adapters = 1, .identify_state = su3000_identify_state, .i2c_algo = &su3000_i2c_algo, .rc.core = { .rc_interval = 150, .rc_codes = RC_MAP_SU3000, .module_name = "dw2102", .allowed_protos = RC_PROTO_BIT_RC5, .rc_query = su3000_rc_query, }, .read_mac_address = su3000_read_mac_address, .generic_bulk_ctrl_endpoint = 0x01, .adapter = { { .num_frontends = 1, .fe = {{ .streaming_ctrl = su3000_streaming_ctrl, .frontend_attach = su3000_frontend_attach, .stream = { .type = USB_BULK, .count = 8, .endpoint = 0x82, .u = { .bulk = { .buffersize = 4096, } } } }}, } }, .num_device_descs = 9, .devices = { { "SU3000HD DVB-S USB2.0", { &dw2102_table[GENIATECH_SU3000], NULL }, { NULL }, }, { "Hauppauge MAX S2 or WinTV NOVA HD USB2.0", { &dw2102_table[HAUPPAUGE_MAX_S2], NULL }, { NULL }, }, { "Terratec Cinergy S2 USB HD", { &dw2102_table[TERRATEC_CINERGY_S2_R1], NULL }, { NULL }, }, { "X3M TV SPC1400HD PCI", { &dw2102_table[GENIATECH_X3M_SPC1400HD], NULL }, { NULL }, }, { "Terratec Cinergy S2 USB HD Rev.2", { &dw2102_table[TERRATEC_CINERGY_S2_R2], NULL }, { NULL }, }, { "Terratec Cinergy S2 USB HD Rev.3", { &dw2102_table[TERRATEC_CINERGY_S2_R3], NULL }, { NULL }, }, { "Terratec Cinergy S2 PCIe Dual Port 1", { &dw2102_table[TERRATEC_CINERGY_S2_1], NULL }, { NULL }, }, { "Terratec Cinergy S2 PCIe Dual Port 2", { &dw2102_table[TERRATEC_CINERGY_S2_2], NULL }, { NULL }, }, { "GOTVIEW Satellite HD", { &dw2102_table[GOTVIEW_SAT_HD], NULL }, { NULL }, }, } }; static struct dvb_usb_device_properties s421_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .size_of_priv = sizeof(struct dw2102_state), .power_ctrl = su3000_power_ctrl, .num_adapters = 1, .identify_state = su3000_identify_state, .i2c_algo = &su3000_i2c_algo, .rc.core = { .rc_interval = 150, .rc_codes = RC_MAP_SU3000, .module_name = "dw2102", .allowed_protos = RC_PROTO_BIT_RC5, .rc_query = su3000_rc_query, }, .read_mac_address = su3000_read_mac_address, .generic_bulk_ctrl_endpoint = 0x01, .adapter = { { .num_frontends = 1, .fe = {{ .streaming_ctrl = su3000_streaming_ctrl, .frontend_attach = m88rs2000_frontend_attach, .stream = { .type = USB_BULK, .count = 8, .endpoint = 0x82, .u = { .bulk = { .buffersize = 4096, } } } } }, } }, .num_device_descs = 2, .devices = { { "TeVii S421 PCI", { &dw2102_table[TEVII_S421], NULL }, { NULL }, }, { "TeVii S632 USB", { &dw2102_table[TEVII_S632], NULL }, { NULL }, }, } }; static struct dvb_usb_device_properties t220_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .size_of_priv = sizeof(struct dw2102_state), .power_ctrl = su3000_power_ctrl, .num_adapters = 1, .identify_state = su3000_identify_state, .i2c_algo = &su3000_i2c_algo, .rc.core = { .rc_interval = 150, .rc_codes = RC_MAP_SU3000, .module_name = "dw2102", .allowed_protos = RC_PROTO_BIT_RC5, .rc_query = su3000_rc_query, }, .read_mac_address = su3000_read_mac_address, .generic_bulk_ctrl_endpoint = 0x01, .adapter = { { .num_frontends = 1, .fe = { { .streaming_ctrl = su3000_streaming_ctrl, .frontend_attach = t220_frontend_attach, .stream = { .type = USB_BULK, .count = 8, .endpoint = 0x82, .u = { .bulk = { .buffersize = 4096, } } } } }, } }, .num_device_descs = 1, .devices = { { "Geniatech T220 DVB-T/T2 USB2.0", { &dw2102_table[GENIATECH_T220], NULL }, { NULL }, }, } }; static struct dvb_usb_device_properties tt_s2_4600_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, .usb_ctrl = DEVICE_SPECIFIC, .size_of_priv = sizeof(struct dw2102_state), .power_ctrl = su3000_power_ctrl, .num_adapters = 1, .identify_state = su3000_identify_state, .i2c_algo = &su3000_i2c_algo, .rc.core = { .rc_interval = 250, .rc_codes = RC_MAP_TT_1500, .module_name = "dw2102", .allowed_protos = RC_PROTO_BIT_RC5, .rc_query = su3000_rc_query, }, .read_mac_address = su3000_read_mac_address, .generic_bulk_ctrl_endpoint = 0x01, .adapter = { { .num_frontends = 1, .fe = {{ .streaming_ctrl = su3000_streaming_ctrl, .frontend_attach = tt_s2_4600_frontend_attach, .stream = { .type = USB_BULK, .count = 8, .endpoint = 0x82, .u = { .bulk = { .buffersize = 4096, } } } } }, } }, .num_device_descs = 5, .devices = { { "TechnoTrend TT-connect S2-4600", { &dw2102_table[TECHNOTREND_CONNECT_S2_4600], NULL }, { NULL }, }, { "TeVii S482 (tuner 1)", { &dw2102_table[TEVII_S482_1], NULL }, { NULL }, }, { "TeVii S482 (tuner 2)", { &dw2102_table[TEVII_S482_2], NULL }, { NULL }, }, { "Terratec Cinergy S2 USB BOX", { &dw2102_table[TERRATEC_CINERGY_S2_R4], NULL }, { NULL }, }, { "TeVii S662", { &dw2102_table[TEVII_S662], NULL }, { NULL }, }, } }; static int dw2102_probe(struct usb_interface *intf, const struct usb_device_id *id) { if (!(dvb_usb_device_init(intf, &dw2102_properties, THIS_MODULE, NULL, adapter_nr) && dvb_usb_device_init(intf, &dw2104_properties, THIS_MODULE, NULL, adapter_nr) && dvb_usb_device_init(intf, &dw3101_properties, THIS_MODULE, NULL, adapter_nr) && dvb_usb_device_init(intf, &s6x0_properties, THIS_MODULE, NULL, adapter_nr) && dvb_usb_device_init(intf, &p1100_properties, THIS_MODULE, NULL, adapter_nr) && dvb_usb_device_init(intf, &s660_properties, THIS_MODULE, NULL, adapter_nr) && dvb_usb_device_init(intf, &p7500_properties, THIS_MODULE, NULL, adapter_nr) && dvb_usb_device_init(intf, &s421_properties, THIS_MODULE, NULL, adapter_nr) && dvb_usb_device_init(intf, &su3000_properties, THIS_MODULE, NULL, adapter_nr) && dvb_usb_device_init(intf, &t220_properties, THIS_MODULE, NULL, adapter_nr) && dvb_usb_device_init(intf, &tt_s2_4600_properties, THIS_MODULE, NULL, adapter_nr))) { return 0; } return -ENODEV; } static void dw2102_disconnect(struct usb_interface *intf) { struct dvb_usb_device *d = usb_get_intfdata(intf); struct dw2102_state *st = d->priv; struct i2c_client *client; /* remove I2C client for tuner */ client = st->i2c_client_tuner; if (client) { module_put(client->dev.driver->owner); i2c_unregister_device(client); } /* remove I2C client for demodulator */ client = st->i2c_client_demod; if (client) { module_put(client->dev.driver->owner); i2c_unregister_device(client); } dvb_usb_device_exit(intf); } static struct usb_driver dw2102_driver = { .name = "dw2102", .probe = dw2102_probe, .disconnect = dw2102_disconnect, .id_table = dw2102_table, }; module_usb_driver(dw2102_driver); MODULE_AUTHOR("Igor M. Liplianin (c) liplianin@me.by"); MODULE_DESCRIPTION("Driver for DVBWorld DVB-S 2101, 2102, DVB-S2 2104, DVB-C 3101 USB2.0, TeVii S421, S480, S482, S600, S630, S632, S650, TeVii S660, S662, Prof 1100, 7500 USB2.0, Geniatech SU3000, T220, TechnoTrend S2-4600, Terratec Cinergy S2 devices"); MODULE_VERSION("0.1"); MODULE_LICENSE("GPL"); MODULE_FIRMWARE(DW2101_FIRMWARE); MODULE_FIRMWARE(DW2102_FIRMWARE); MODULE_FIRMWARE(DW2104_FIRMWARE); MODULE_FIRMWARE(DW3101_FIRMWARE); MODULE_FIRMWARE(S630_FIRMWARE); MODULE_FIRMWARE(S660_FIRMWARE); MODULE_FIRMWARE(P1100_FIRMWARE); MODULE_FIRMWARE(P7500_FIRMWARE); |
65 9 78 68 431 56 363 150 165 3 3414 1475 10 7878 7866 2164 370 1013 4 3 15 1 11092 2342 981 26 1816 1824 1999 15 406 65 6 5 73 9 576 42 15 6 3 2 10 2 38 22 97 1772 389 1938 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_NETLINK_H #define __NET_NETLINK_H #include <linux/types.h> #include <linux/netlink.h> #include <linux/jiffies.h> #include <linux/in6.h> /* ======================================================================== * Netlink Messages and Attributes Interface (As Seen On TV) * ------------------------------------------------------------------------ * Messages Interface * ------------------------------------------------------------------------ * * Message Format: * <--- nlmsg_total_size(payload) ---> * <-- nlmsg_msg_size(payload) -> * +----------+- - -+-------------+- - -+-------- - - * | nlmsghdr | Pad | Payload | Pad | nlmsghdr * +----------+- - -+-------------+- - -+-------- - - * nlmsg_data(nlh)---^ ^ * nlmsg_next(nlh)-----------------------+ * * Payload Format: * <---------------------- nlmsg_len(nlh) ---------------------> * <------ hdrlen ------> <- nlmsg_attrlen(nlh, hdrlen) -> * +----------------------+- - -+--------------------------------+ * | Family Header | Pad | Attributes | * +----------------------+- - -+--------------------------------+ * nlmsg_attrdata(nlh, hdrlen)---^ * * Data Structures: * struct nlmsghdr netlink message header * * Message Construction: * nlmsg_new() create a new netlink message * nlmsg_put() add a netlink message to an skb * nlmsg_put_answer() callback based nlmsg_put() * nlmsg_end() finalize netlink message * nlmsg_get_pos() return current position in message * nlmsg_trim() trim part of message * nlmsg_cancel() cancel message construction * nlmsg_consume() free a netlink message (expected) * nlmsg_free() free a netlink message (drop) * * Message Sending: * nlmsg_multicast() multicast message to several groups * nlmsg_unicast() unicast a message to a single socket * nlmsg_notify() send notification message * * Message Length Calculations: * nlmsg_msg_size(payload) length of message w/o padding * nlmsg_total_size(payload) length of message w/ padding * nlmsg_padlen(payload) length of padding at tail * * Message Payload Access: * nlmsg_data(nlh) head of message payload * nlmsg_len(nlh) length of message payload * nlmsg_attrdata(nlh, hdrlen) head of attributes data * nlmsg_attrlen(nlh, hdrlen) length of attributes data * * Message Parsing: * nlmsg_ok(nlh, remaining) does nlh fit into remaining bytes? * nlmsg_next(nlh, remaining) get next netlink message * nlmsg_parse() parse attributes of a message * nlmsg_find_attr() find an attribute in a message * nlmsg_for_each_msg() loop over all messages * nlmsg_validate() validate netlink message incl. attrs * nlmsg_for_each_attr() loop over all attributes * * Misc: * nlmsg_report() report back to application? * * ------------------------------------------------------------------------ * Attributes Interface * ------------------------------------------------------------------------ * * Attribute Format: * <------- nla_total_size(payload) -------> * <---- nla_attr_size(payload) -----> * +----------+- - -+- - - - - - - - - +- - -+-------- - - * | Header | Pad | Payload | Pad | Header * +----------+- - -+- - - - - - - - - +- - -+-------- - - * <- nla_len(nla) -> ^ * nla_data(nla)----^ | * nla_next(nla)-----------------------------' * * Data Structures: * struct nlattr netlink attribute header * * Attribute Construction: * nla_reserve(skb, type, len) reserve room for an attribute * nla_reserve_nohdr(skb, len) reserve room for an attribute w/o hdr * nla_put(skb, type, len, data) add attribute to skb * nla_put_nohdr(skb, len, data) add attribute w/o hdr * nla_append(skb, len, data) append data to skb * * Attribute Construction for Basic Types: * nla_put_u8(skb, type, value) add u8 attribute to skb * nla_put_u16(skb, type, value) add u16 attribute to skb * nla_put_u32(skb, type, value) add u32 attribute to skb * nla_put_u64_64bit(skb, type, * value, padattr) add u64 attribute to skb * nla_put_s8(skb, type, value) add s8 attribute to skb * nla_put_s16(skb, type, value) add s16 attribute to skb * nla_put_s32(skb, type, value) add s32 attribute to skb * nla_put_s64(skb, type, value, * padattr) add s64 attribute to skb * nla_put_string(skb, type, str) add string attribute to skb * nla_put_flag(skb, type) add flag attribute to skb * nla_put_msecs(skb, type, jiffies, * padattr) add msecs attribute to skb * nla_put_in_addr(skb, type, addr) add IPv4 address attribute to skb * nla_put_in6_addr(skb, type, addr) add IPv6 address attribute to skb * * Nested Attributes Construction: * nla_nest_start(skb, type) start a nested attribute * nla_nest_end(skb, nla) finalize a nested attribute * nla_nest_cancel(skb, nla) cancel nested attribute construction * * Attribute Length Calculations: * nla_attr_size(payload) length of attribute w/o padding * nla_total_size(payload) length of attribute w/ padding * nla_padlen(payload) length of padding * * Attribute Payload Access: * nla_data(nla) head of attribute payload * nla_len(nla) length of attribute payload * * Attribute Payload Access for Basic Types: * nla_get_uint(nla) get payload for a uint attribute * nla_get_sint(nla) get payload for a sint attribute * nla_get_u8(nla) get payload for a u8 attribute * nla_get_u16(nla) get payload for a u16 attribute * nla_get_u32(nla) get payload for a u32 attribute * nla_get_u64(nla) get payload for a u64 attribute * nla_get_s8(nla) get payload for a s8 attribute * nla_get_s16(nla) get payload for a s16 attribute * nla_get_s32(nla) get payload for a s32 attribute * nla_get_s64(nla) get payload for a s64 attribute * nla_get_flag(nla) return 1 if flag is true * nla_get_msecs(nla) get payload for a msecs attribute * * The same functions also exist with _default(). * * Attribute Misc: * nla_memcpy(dest, nla, count) copy attribute into memory * nla_memcmp(nla, data, size) compare attribute with memory area * nla_strscpy(dst, nla, size) copy attribute to a sized string * nla_strcmp(nla, str) compare attribute with string * * Attribute Parsing: * nla_ok(nla, remaining) does nla fit into remaining bytes? * nla_next(nla, remaining) get next netlink attribute * nla_validate() validate a stream of attributes * nla_validate_nested() validate a stream of nested attributes * nla_find() find attribute in stream of attributes * nla_find_nested() find attribute in nested attributes * nla_parse() parse and validate stream of attrs * nla_parse_nested() parse nested attributes * nla_for_each_attr() loop over all attributes * nla_for_each_attr_type() loop over all attributes with the * given type * nla_for_each_nested() loop over the nested attributes * nla_for_each_nested_type() loop over the nested attributes with * the given type *========================================================================= */ /** * Standard attribute types to specify validation policy */ enum { NLA_UNSPEC, NLA_U8, NLA_U16, NLA_U32, NLA_U64, NLA_STRING, NLA_FLAG, NLA_MSECS, NLA_NESTED, NLA_NESTED_ARRAY, NLA_NUL_STRING, NLA_BINARY, NLA_S8, NLA_S16, NLA_S32, NLA_S64, NLA_BITFIELD32, NLA_REJECT, NLA_BE16, NLA_BE32, NLA_SINT, NLA_UINT, __NLA_TYPE_MAX, }; #define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1) struct netlink_range_validation { u64 min, max; }; struct netlink_range_validation_signed { s64 min, max; }; enum nla_policy_validation { NLA_VALIDATE_NONE, NLA_VALIDATE_RANGE, NLA_VALIDATE_RANGE_WARN_TOO_LONG, NLA_VALIDATE_MIN, NLA_VALIDATE_MAX, NLA_VALIDATE_MASK, NLA_VALIDATE_RANGE_PTR, NLA_VALIDATE_FUNCTION, }; /** * struct nla_policy - attribute validation policy * @type: Type of attribute or NLA_UNSPEC * @validation_type: type of attribute validation done in addition to * type-specific validation (e.g. range, function call), see * &enum nla_policy_validation * @len: Type specific length of payload * * Policies are defined as arrays of this struct, the array must be * accessible by attribute type up to the highest identifier to be expected. * * Meaning of `len' field: * NLA_STRING Maximum length of string * NLA_NUL_STRING Maximum length of string (excluding NUL) * NLA_FLAG Unused * NLA_BINARY Maximum length of attribute payload * (but see also below with the validation type) * NLA_NESTED, * NLA_NESTED_ARRAY Length verification is done by checking len of * nested header (or empty); len field is used if * nested_policy is also used, for the max attr * number in the nested policy. * NLA_SINT, NLA_UINT, * NLA_U8, NLA_U16, * NLA_U32, NLA_U64, * NLA_S8, NLA_S16, * NLA_S32, NLA_S64, * NLA_BE16, NLA_BE32, * NLA_MSECS Leaving the length field zero will verify the * given type fits, using it verifies minimum length * just like "All other" * NLA_BITFIELD32 Unused * NLA_REJECT Unused * All other Minimum length of attribute payload * * Meaning of validation union: * NLA_BITFIELD32 This is a 32-bit bitmap/bitselector attribute and * `bitfield32_valid' is the u32 value of valid flags * NLA_REJECT This attribute is always rejected and `reject_message' * may point to a string to report as the error instead * of the generic one in extended ACK. * NLA_NESTED `nested_policy' to a nested policy to validate, must * also set `len' to the max attribute number. Use the * provided NLA_POLICY_NESTED() macro. * Note that nla_parse() will validate, but of course not * parse, the nested sub-policies. * NLA_NESTED_ARRAY `nested_policy' points to a nested policy to validate, * must also set `len' to the max attribute number. Use * the provided NLA_POLICY_NESTED_ARRAY() macro. * The difference to NLA_NESTED is the structure: * NLA_NESTED has the nested attributes directly inside * while an array has the nested attributes at another * level down and the attribute types directly in the * nesting don't matter. * NLA_UINT, * NLA_U8, * NLA_U16, * NLA_U32, * NLA_U64, * NLA_BE16, * NLA_BE32, * NLA_SINT, * NLA_S8, * NLA_S16, * NLA_S32, * NLA_S64 The `min' and `max' fields are used depending on the * validation_type field, if that is min/max/range then * the min, max or both are used (respectively) to check * the value of the integer attribute. * Note that in the interest of code simplicity and * struct size both limits are s16, so you cannot * enforce a range that doesn't fall within the range * of s16 - do that using the NLA_POLICY_FULL_RANGE() * or NLA_POLICY_FULL_RANGE_SIGNED() macros instead. * Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and * NLA_POLICY_RANGE() macros. * NLA_UINT, * NLA_U8, * NLA_U16, * NLA_U32, * NLA_U64 If the validation_type field instead is set to * NLA_VALIDATE_RANGE_PTR, `range' must be a pointer * to a struct netlink_range_validation that indicates * the min/max values. * Use NLA_POLICY_FULL_RANGE(). * NLA_SINT, * NLA_S8, * NLA_S16, * NLA_S32, * NLA_S64 If the validation_type field instead is set to * NLA_VALIDATE_RANGE_PTR, `range_signed' must be a * pointer to a struct netlink_range_validation_signed * that indicates the min/max values. * Use NLA_POLICY_FULL_RANGE_SIGNED(). * * NLA_BINARY If the validation type is like the ones for integers * above, then the min/max length (not value like for * integers) of the attribute is enforced. * * All other Unused - but note that it's a union * * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN: * NLA_BINARY Validation function called for the attribute. * All other Unused - but note that it's a union * * Example: * * static const u32 myvalidflags = 0xff231023; * * static const struct nla_policy my_policy[ATTR_MAX+1] = { * [ATTR_FOO] = { .type = NLA_U16 }, * [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ }, * [ATTR_BAZ] = NLA_POLICY_EXACT_LEN(sizeof(struct mystruct)), * [ATTR_GOO] = NLA_POLICY_BITFIELD32(myvalidflags), * }; */ struct nla_policy { u8 type; u8 validation_type; u16 len; union { /** * @strict_start_type: first attribute to validate strictly * * This entry is special, and used for the attribute at index 0 * only, and specifies special data about the policy, namely it * specifies the "boundary type" where strict length validation * starts for any attribute types >= this value, also, strict * nesting validation starts here. * * Additionally, it means that NLA_UNSPEC is actually NLA_REJECT * for any types >= this, so need to use NLA_POLICY_MIN_LEN() to * get the previous pure { .len = xyz } behaviour. The advantage * of this is that types not specified in the policy will be * rejected. * * For completely new families it should be set to 1 so that the * validation is enforced for all attributes. For existing ones * it should be set at least when new attributes are added to * the enum used by the policy, and be set to the new value that * was added to enforce strict validation from thereon. */ u16 strict_start_type; /* private: use NLA_POLICY_*() to set */ const u32 bitfield32_valid; const u32 mask; const char *reject_message; const struct nla_policy *nested_policy; const struct netlink_range_validation *range; const struct netlink_range_validation_signed *range_signed; struct { s16 min, max; }; int (*validate)(const struct nlattr *attr, struct netlink_ext_ack *extack); }; }; #define NLA_POLICY_ETH_ADDR NLA_POLICY_EXACT_LEN(ETH_ALEN) #define NLA_POLICY_ETH_ADDR_COMPAT NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN) #define _NLA_POLICY_NESTED(maxattr, policy) \ { .type = NLA_NESTED, .nested_policy = policy, .len = maxattr } #define _NLA_POLICY_NESTED_ARRAY(maxattr, policy) \ { .type = NLA_NESTED_ARRAY, .nested_policy = policy, .len = maxattr } #define NLA_POLICY_NESTED(policy) \ _NLA_POLICY_NESTED(ARRAY_SIZE(policy) - 1, policy) #define NLA_POLICY_NESTED_ARRAY(policy) \ _NLA_POLICY_NESTED_ARRAY(ARRAY_SIZE(policy) - 1, policy) #define NLA_POLICY_BITFIELD32(valid) \ { .type = NLA_BITFIELD32, .bitfield32_valid = valid } #define __NLA_IS_UINT_TYPE(tp) \ (tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 || \ tp == NLA_U64 || tp == NLA_UINT || \ tp == NLA_BE16 || tp == NLA_BE32) #define __NLA_IS_SINT_TYPE(tp) \ (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64 || \ tp == NLA_SINT) #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) #define NLA_ENSURE_UINT_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp)) + tp) #define NLA_ENSURE_UINT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \ tp == NLA_MSECS || \ tp == NLA_BINARY) + tp) #define NLA_ENSURE_SINT_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_SINT_TYPE(tp)) + tp) #define NLA_ENSURE_INT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \ __NLA_IS_SINT_TYPE(tp) || \ tp == NLA_MSECS || \ tp == NLA_BINARY) + tp) #define NLA_ENSURE_NO_VALIDATION_PTR(tp) \ (__NLA_ENSURE(tp != NLA_BITFIELD32 && \ tp != NLA_REJECT && \ tp != NLA_NESTED && \ tp != NLA_NESTED_ARRAY) + tp) #define NLA_POLICY_RANGE(tp, _min, _max) { \ .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE, \ .min = _min, \ .max = _max \ } #define NLA_POLICY_FULL_RANGE(tp, _range) { \ .type = NLA_ENSURE_UINT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE_PTR, \ .range = _range, \ } #define NLA_POLICY_FULL_RANGE_SIGNED(tp, _range) { \ .type = NLA_ENSURE_SINT_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE_PTR, \ .range_signed = _range, \ } #define NLA_POLICY_MIN(tp, _min) { \ .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_MIN, \ .min = _min, \ } #define NLA_POLICY_MAX(tp, _max) { \ .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_MAX, \ .max = _max, \ } #define NLA_POLICY_MASK(tp, _mask) { \ .type = NLA_ENSURE_UINT_TYPE(tp), \ .validation_type = NLA_VALIDATE_MASK, \ .mask = _mask, \ } #define NLA_POLICY_VALIDATE_FN(tp, fn, ...) { \ .type = NLA_ENSURE_NO_VALIDATION_PTR(tp), \ .validation_type = NLA_VALIDATE_FUNCTION, \ .validate = fn, \ .len = __VA_ARGS__ + 0, \ } #define NLA_POLICY_EXACT_LEN(_len) NLA_POLICY_RANGE(NLA_BINARY, _len, _len) #define NLA_POLICY_EXACT_LEN_WARN(_len) { \ .type = NLA_BINARY, \ .validation_type = NLA_VALIDATE_RANGE_WARN_TOO_LONG, \ .min = _len, \ .max = _len \ } #define NLA_POLICY_MIN_LEN(_len) NLA_POLICY_MIN(NLA_BINARY, _len) #define NLA_POLICY_MAX_LEN(_len) NLA_POLICY_MAX(NLA_BINARY, _len) /** * struct nl_info - netlink source information * @nlh: Netlink message header of original request * @nl_net: Network namespace * @portid: Netlink PORTID of requesting application * @skip_notify: Skip netlink notifications to user space * @skip_notify_kernel: Skip selected in-kernel notifications */ struct nl_info { struct nlmsghdr *nlh; struct net *nl_net; u32 portid; u8 skip_notify:1, skip_notify_kernel:1; }; /** * enum netlink_validation - netlink message/attribute validation levels * @NL_VALIDATE_LIBERAL: Old-style "be liberal" validation, not caring about * extra data at the end of the message, attributes being longer than * they should be, or unknown attributes being present. * @NL_VALIDATE_TRAILING: Reject junk data encountered after attribute parsing. * @NL_VALIDATE_MAXTYPE: Reject attributes > max type; Together with _TRAILING * this is equivalent to the old nla_parse_strict()/nlmsg_parse_strict(). * @NL_VALIDATE_UNSPEC: Reject attributes with NLA_UNSPEC in the policy. * This can safely be set by the kernel when the given policy has no * NLA_UNSPEC anymore, and can thus be used to ensure policy entries * are enforced going forward. * @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g. * U8, U16, U32 must have exact size, etc.) * @NL_VALIDATE_NESTED: Check that NLA_F_NESTED is set for NLA_NESTED(_ARRAY) * and unset for other policies. */ enum netlink_validation { NL_VALIDATE_LIBERAL = 0, NL_VALIDATE_TRAILING = BIT(0), NL_VALIDATE_MAXTYPE = BIT(1), NL_VALIDATE_UNSPEC = BIT(2), NL_VALIDATE_STRICT_ATTRS = BIT(3), NL_VALIDATE_NESTED = BIT(4), }; #define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\ NL_VALIDATE_MAXTYPE) #define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\ NL_VALIDATE_MAXTYPE |\ NL_VALIDATE_UNSPEC |\ NL_VALIDATE_STRICT_ATTRS |\ NL_VALIDATE_NESTED) int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)); int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, int report, gfp_t flags); int __nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack); int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack); int nla_policy_len(const struct nla_policy *, int); struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype); ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize); char *nla_strdup(const struct nlattr *nla, gfp_t flags); int nla_memcpy(void *dest, const struct nlattr *src, int count); int nla_memcmp(const struct nlattr *nla, const void *data, size_t size); int nla_strcmp(const struct nlattr *nla, const char *str); struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen); struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr); void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen); struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen); struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr); void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen); void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data); void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr); void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data); int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data); int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr); int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data); int nla_append(struct sk_buff *skb, int attrlen, const void *data); /************************************************************************** * Netlink Messages **************************************************************************/ /** * nlmsg_msg_size - length of netlink message not including padding * @payload: length of message payload */ static inline int nlmsg_msg_size(int payload) { return NLMSG_HDRLEN + payload; } /** * nlmsg_total_size - length of netlink message including padding * @payload: length of message payload */ static inline int nlmsg_total_size(int payload) { return NLMSG_ALIGN(nlmsg_msg_size(payload)); } /** * nlmsg_padlen - length of padding at the message's tail * @payload: length of message payload */ static inline int nlmsg_padlen(int payload) { return nlmsg_total_size(payload) - nlmsg_msg_size(payload); } /** * nlmsg_data - head of message payload * @nlh: netlink message header */ static inline void *nlmsg_data(const struct nlmsghdr *nlh) { return (unsigned char *) nlh + NLMSG_HDRLEN; } /** * nlmsg_len - length of message payload * @nlh: netlink message header */ static inline int nlmsg_len(const struct nlmsghdr *nlh) { return nlh->nlmsg_len - NLMSG_HDRLEN; } /** * nlmsg_attrdata - head of attributes data * @nlh: netlink message header * @hdrlen: length of family specific header */ static inline struct nlattr *nlmsg_attrdata(const struct nlmsghdr *nlh, int hdrlen) { unsigned char *data = nlmsg_data(nlh); return (struct nlattr *) (data + NLMSG_ALIGN(hdrlen)); } /** * nlmsg_attrlen - length of attributes data * @nlh: netlink message header * @hdrlen: length of family specific header */ static inline int nlmsg_attrlen(const struct nlmsghdr *nlh, int hdrlen) { return nlmsg_len(nlh) - NLMSG_ALIGN(hdrlen); } /** * nlmsg_ok - check if the netlink message fits into the remaining bytes * @nlh: netlink message header * @remaining: number of bytes remaining in message stream */ static inline int nlmsg_ok(const struct nlmsghdr *nlh, int remaining) { return (remaining >= (int) sizeof(struct nlmsghdr) && nlh->nlmsg_len >= sizeof(struct nlmsghdr) && nlh->nlmsg_len <= remaining); } /** * nlmsg_next - next netlink message in message stream * @nlh: netlink message header * @remaining: number of bytes remaining in message stream * * Returns the next netlink message in the message stream and * decrements remaining by the size of the current message. */ static inline struct nlmsghdr * nlmsg_next(const struct nlmsghdr *nlh, int *remaining) { int totlen = NLMSG_ALIGN(nlh->nlmsg_len); *remaining -= totlen; return (struct nlmsghdr *) ((unsigned char *) nlh + totlen); } /** * nla_parse - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. Attributes with a type * exceeding maxtype will be rejected, policy must be specified, attributes * will be validated in the strictest way possible. * * Returns 0 on success or a negative error code. */ static inline int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, head, len, policy, NL_VALIDATE_STRICT, extack); } /** * nla_parse_deprecated - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. Attributes with a type * exceeding maxtype will be ignored and attributes from the policy are not * always strictly validated (only for new attributes). * * Returns 0 on success or a negative error code. */ static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, head, len, policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_parse_deprecated_strict - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. Attributes with a type * exceeding maxtype will be rejected as well as trailing data, but the * policy is not completely strictly validated (only for new attributes). * * Returns 0 on success or a negative error code. */ static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, head, len, policy, NL_VALIDATE_DEPRECATED_STRICT, extack); } /** * __nlmsg_parse - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * See nla_parse() */ static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) { NL_SET_ERR_MSG(extack, "Invalid header length"); return -EINVAL; } return __nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), policy, validate, extack); } /** * nlmsg_parse - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse() */ static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, NL_VALIDATE_STRICT, extack); } /** * nlmsg_parse_deprecated - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse_deprecated() */ static inline int nlmsg_parse_deprecated(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nlmsg_parse_deprecated_strict - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse_deprecated_strict() */ static inline int nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, NL_VALIDATE_DEPRECATED_STRICT, extack); } /** * nlmsg_find_attr - find a specific attribute in a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @attrtype: type of attribute to look for * * Returns the first attribute which matches the specified type. */ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, int hdrlen, int attrtype) { return nla_find(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), attrtype); } /** * nla_validate_deprecated - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation is done in liberal mode. * See documentation of struct nla_policy for more details. * * Returns 0 on success or a negative error code. */ static inline int nla_validate_deprecated(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_validate - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation is done in strict mode. * See documentation of struct nla_policy for more details. * * Returns 0 on success or a negative error code. */ static inline int nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_STRICT, extack); } /** * nlmsg_validate_deprecated - validate a netlink message including attributes * @nlh: netlinket message header * @hdrlen: length of family specific header * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct */ static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh, int hdrlen, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return -EINVAL; return __nla_validate(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nlmsg_report - need to report back to application? * @nlh: netlink message header * * Returns 1 if a report back to the application is requested. */ static inline int nlmsg_report(const struct nlmsghdr *nlh) { return nlh ? !!(nlh->nlmsg_flags & NLM_F_ECHO) : 0; } /** * nlmsg_seq - return the seq number of netlink message * @nlh: netlink message header * * Returns 0 if netlink message is NULL */ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) { return nlh ? nlh->nlmsg_seq : 0; } /** * nlmsg_for_each_attr - iterate over a stream of attributes * @pos: loop counter, set to current attribute * @nlh: netlink message header * @hdrlen: length of family specific header * @rem: initialized to len, holds bytes currently remaining in stream */ #define nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \ nla_for_each_attr(pos, nlmsg_attrdata(nlh, hdrlen), \ nlmsg_attrlen(nlh, hdrlen), rem) /** * nlmsg_put - Add a new netlink message to an skb * @skb: socket buffer to store message in * @portid: netlink PORTID of requesting application * @seq: sequence number of message * @type: message type * @payload: length of message payload * @flags: message flags * * Returns NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int payload, int flags) { if (unlikely(skb_tailroom(skb) < nlmsg_total_size(payload))) return NULL; return __nlmsg_put(skb, portid, seq, type, payload, flags); } /** * nlmsg_append - Add more data to a nlmsg in a skb * @skb: socket buffer to store message in * @size: length of message payload * * Append data to an existing nlmsg, used when constructing a message * with multiple fixed-format headers (which is rare). * Returns NULL if the tailroom of the skb is insufficient to store * the extra payload. */ static inline void *nlmsg_append(struct sk_buff *skb, u32 size) { if (unlikely(skb_tailroom(skb) < NLMSG_ALIGN(size))) return NULL; if (NLMSG_ALIGN(size) - size) memset(skb_tail_pointer(skb) + size, 0, NLMSG_ALIGN(size) - size); return __skb_put(skb, NLMSG_ALIGN(size)); } /** * nlmsg_put_answer - Add a new callback based netlink message to an skb * @skb: socket buffer to store message in * @cb: netlink callback * @type: message type * @payload: length of message payload * @flags: message flags * * Returns NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb, struct netlink_callback *cb, int type, int payload, int flags) { return nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, type, payload, flags); } /** * nlmsg_new - Allocate a new netlink message * @payload: size of the message payload * @flags: the type of memory to allocate. * * Use NLMSG_DEFAULT_SIZE if the size of the payload isn't known * and a good default is needed. */ static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags) { return alloc_skb(nlmsg_total_size(payload), flags); } /** * nlmsg_new_large - Allocate a new netlink message with non-contiguous * physical memory * @payload: size of the message payload * * The allocated skb is unable to have frag page for shinfo->frags*, * as the NULL setting for skb->head in netlink_skb_destructor() will * bypass most of the handling in skb_release_data() */ static inline struct sk_buff *nlmsg_new_large(size_t payload) { return netlink_alloc_large_skb(nlmsg_total_size(payload), 0); } /** * nlmsg_end - Finalize a netlink message * @skb: socket buffer the message is stored in * @nlh: netlink message header * * Corrects the netlink message header to include the appended * attributes. Only necessary if attributes have been added to * the message. */ static inline void nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh) { nlh->nlmsg_len = skb_tail_pointer(skb) - (unsigned char *)nlh; } /** * nlmsg_get_pos - return current position in netlink message * @skb: socket buffer the message is stored in * * Returns a pointer to the current tail of the message. */ static inline void *nlmsg_get_pos(struct sk_buff *skb) { return skb_tail_pointer(skb); } /** * nlmsg_trim - Trim message to a mark * @skb: socket buffer the message is stored in * @mark: mark to trim to * * Trims the message to the provided mark. */ static inline void nlmsg_trim(struct sk_buff *skb, const void *mark) { if (mark) { WARN_ON((unsigned char *) mark < skb->data); skb_trim(skb, (unsigned char *) mark - skb->data); } } /** * nlmsg_cancel - Cancel construction of a netlink message * @skb: socket buffer the message is stored in * @nlh: netlink message header * * Removes the complete netlink message including all * attributes from the socket buffer again. */ static inline void nlmsg_cancel(struct sk_buff *skb, struct nlmsghdr *nlh) { nlmsg_trim(skb, nlh); } /** * nlmsg_free - drop a netlink message * @skb: socket buffer of netlink message */ static inline void nlmsg_free(struct sk_buff *skb) { kfree_skb(skb); } /** * nlmsg_consume - free a netlink message * @skb: socket buffer of netlink message */ static inline void nlmsg_consume(struct sk_buff *skb) { consume_skb(skb); } /** * nlmsg_multicast_filtered - multicast a netlink message with filter function * @sk: netlink socket to spread messages to * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: multicast group id * @flags: allocation flags * @filter: filter function * @filter_data: filter function private data * * Return: 0 on success, negative error code for failure. */ static inline int nlmsg_multicast_filtered(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags, netlink_filter_fn filter, void *filter_data) { int err; NETLINK_CB(skb).dst_group = group; err = netlink_broadcast_filtered(sk, skb, portid, group, flags, filter, filter_data); if (err > 0) err = 0; return err; } /** * nlmsg_multicast - multicast a netlink message * @sk: netlink socket to spread messages to * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: multicast group id * @flags: allocation flags */ static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { return nlmsg_multicast_filtered(sk, skb, portid, group, flags, NULL, NULL); } /** * nlmsg_unicast - unicast a netlink message * @sk: netlink socket to spread message to * @skb: netlink message as socket buffer * @portid: netlink portid of the destination socket */ static inline int nlmsg_unicast(struct sock *sk, struct sk_buff *skb, u32 portid) { int err; err = netlink_unicast(sk, skb, portid, MSG_DONTWAIT); if (err > 0) err = 0; return err; } /** * nlmsg_for_each_msg - iterate over a stream of messages * @pos: loop counter, set to current message * @head: head of message stream * @len: length of message stream * @rem: initialized to len, holds bytes currently remaining in stream */ #define nlmsg_for_each_msg(pos, head, len, rem) \ for (pos = head, rem = len; \ nlmsg_ok(pos, rem); \ pos = nlmsg_next(pos, &(rem))) /** * nl_dump_check_consistent - check if sequence is consistent and advertise if not * @cb: netlink callback structure that stores the sequence number * @nlh: netlink message header to write the flag to * * This function checks if the sequence (generation) number changed during dump * and if it did, advertises it in the netlink message header. * * The correct way to use it is to set cb->seq to the generation counter when * all locks for dumping have been acquired, and then call this function for * each message that is generated. * * Note that due to initialisation concerns, 0 is an invalid sequence number * and must not be used by code that uses this functionality. */ static inline void nl_dump_check_consistent(struct netlink_callback *cb, struct nlmsghdr *nlh) { if (cb->prev_seq && cb->seq != cb->prev_seq) nlh->nlmsg_flags |= NLM_F_DUMP_INTR; cb->prev_seq = cb->seq; } /************************************************************************** * Netlink Attributes **************************************************************************/ /** * nla_attr_size - length of attribute not including padding * @payload: length of payload */ static inline int nla_attr_size(int payload) { return NLA_HDRLEN + payload; } /** * nla_total_size - total length of attribute including padding * @payload: length of payload */ static inline int nla_total_size(int payload) { return NLA_ALIGN(nla_attr_size(payload)); } /** * nla_padlen - length of padding at the tail of attribute * @payload: length of payload */ static inline int nla_padlen(int payload) { return nla_total_size(payload) - nla_attr_size(payload); } /** * nla_type - attribute type * @nla: netlink attribute */ static inline int nla_type(const struct nlattr *nla) { return nla->nla_type & NLA_TYPE_MASK; } /** * nla_data - head of payload * @nla: netlink attribute */ static inline void *nla_data(const struct nlattr *nla) { return (char *) nla + NLA_HDRLEN; } /** * nla_len - length of payload * @nla: netlink attribute */ static inline u16 nla_len(const struct nlattr *nla) { return nla->nla_len - NLA_HDRLEN; } /** * nla_ok - check if the netlink attribute fits into the remaining bytes * @nla: netlink attribute * @remaining: number of bytes remaining in attribute stream */ static inline int nla_ok(const struct nlattr *nla, int remaining) { return remaining >= (int) sizeof(*nla) && nla->nla_len >= sizeof(*nla) && nla->nla_len <= remaining; } /** * nla_next - next netlink attribute in attribute stream * @nla: netlink attribute * @remaining: number of bytes remaining in attribute stream * * Returns the next netlink attribute in the attribute stream and * decrements remaining by the size of the current attribute. */ static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining) { unsigned int totlen = NLA_ALIGN(nla->nla_len); *remaining -= totlen; return (struct nlattr *) ((char *) nla + totlen); } /** * nla_find_nested - find attribute in a set of nested attributes * @nla: attribute containing the nested attributes * @attrtype: type of attribute to look for * * Returns the first attribute which matches the specified type. */ static inline struct nlattr * nla_find_nested(const struct nlattr *nla, int attrtype) { return nla_find(nla_data(nla), nla_len(nla), attrtype); } /** * nla_parse_nested - parse nested attributes * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @nla: attribute containing the nested attributes * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse() */ static inline int nla_parse_nested(struct nlattr *tb[], int maxtype, const struct nlattr *nla, const struct nla_policy *policy, struct netlink_ext_ack *extack) { if (!(nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR(extack, nla, "NLA_F_NESTED is missing"); return -EINVAL; } return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, NL_VALIDATE_STRICT, extack); } /** * nla_parse_nested_deprecated - parse nested attributes * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @nla: attribute containing the nested attributes * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse_deprecated() */ static inline int nla_parse_nested_deprecated(struct nlattr *tb[], int maxtype, const struct nlattr *nla, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_put_u8 - Add a u8 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value) { /* temporary variables to work around GCC PR81715 with asan-stack=1 */ u8 tmp = value; return nla_put(skb, attrtype, sizeof(u8), &tmp); } /** * nla_put_u16 - Add a u16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value) { u16 tmp = value; return nla_put(skb, attrtype, sizeof(u16), &tmp); } /** * nla_put_be16 - Add a __be16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value) { __be16 tmp = value; return nla_put(skb, attrtype, sizeof(__be16), &tmp); } /** * nla_put_net16 - Add 16-bit network byte order netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value) { __be16 tmp = value; return nla_put_be16(skb, attrtype | NLA_F_NET_BYTEORDER, tmp); } /** * nla_put_le16 - Add a __le16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value) { __le16 tmp = value; return nla_put(skb, attrtype, sizeof(__le16), &tmp); } /** * nla_put_u32 - Add a u32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value) { u32 tmp = value; return nla_put(skb, attrtype, sizeof(u32), &tmp); } /** * nla_put_uint - Add a variable-size unsigned int to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_uint(struct sk_buff *skb, int attrtype, u64 value) { u64 tmp64 = value; u32 tmp32 = value; if (tmp64 == tmp32) return nla_put_u32(skb, attrtype, tmp32); return nla_put(skb, attrtype, sizeof(u64), &tmp64); } /** * nla_put_be32 - Add a __be32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value) { __be32 tmp = value; return nla_put(skb, attrtype, sizeof(__be32), &tmp); } /** * nla_put_net32 - Add 32-bit network byte order netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value) { __be32 tmp = value; return nla_put_be32(skb, attrtype | NLA_F_NET_BYTEORDER, tmp); } /** * nla_put_le32 - Add a __le32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value) { __le32 tmp = value; return nla_put(skb, attrtype, sizeof(__le32), &tmp); } /** * nla_put_u64_64bit - Add a u64 netlink attribute to a skb and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype, u64 value, int padattr) { u64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(u64), &tmp, padattr); } /** * nla_put_be64 - Add a __be64 netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value, int padattr) { __be64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(__be64), &tmp, padattr); } /** * nla_put_net64 - Add 64-bit network byte order nlattr to a skb and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value, int padattr) { __be64 tmp = value; return nla_put_be64(skb, attrtype | NLA_F_NET_BYTEORDER, tmp, padattr); } /** * nla_put_le64 - Add a __le64 netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value, int padattr) { __le64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(__le64), &tmp, padattr); } /** * nla_put_s8 - Add a s8 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value) { s8 tmp = value; return nla_put(skb, attrtype, sizeof(s8), &tmp); } /** * nla_put_s16 - Add a s16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value) { s16 tmp = value; return nla_put(skb, attrtype, sizeof(s16), &tmp); } /** * nla_put_s32 - Add a s32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value) { s32 tmp = value; return nla_put(skb, attrtype, sizeof(s32), &tmp); } /** * nla_put_s64 - Add a s64 netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value, int padattr) { s64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(s64), &tmp, padattr); } /** * nla_put_sint - Add a variable-size signed int to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_sint(struct sk_buff *skb, int attrtype, s64 value) { s64 tmp64 = value; s32 tmp32 = value; if (tmp64 == tmp32) return nla_put_s32(skb, attrtype, tmp32); return nla_put(skb, attrtype, sizeof(s64), &tmp64); } /** * nla_put_string - Add a string netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @str: NUL terminated string */ static inline int nla_put_string(struct sk_buff *skb, int attrtype, const char *str) { return nla_put(skb, attrtype, strlen(str) + 1, str); } /** * nla_put_flag - Add a flag netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type */ static inline int nla_put_flag(struct sk_buff *skb, int attrtype) { return nla_put(skb, attrtype, 0, NULL); } /** * nla_put_msecs - Add a msecs netlink attribute to a skb and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @njiffies: number of jiffies to convert to msecs * @padattr: attribute type for the padding */ static inline int nla_put_msecs(struct sk_buff *skb, int attrtype, unsigned long njiffies, int padattr) { u64 tmp = jiffies_to_msecs(njiffies); return nla_put_64bit(skb, attrtype, sizeof(u64), &tmp, padattr); } /** * nla_put_in_addr - Add an IPv4 address netlink attribute to a socket * buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @addr: IPv4 address */ static inline int nla_put_in_addr(struct sk_buff *skb, int attrtype, __be32 addr) { __be32 tmp = addr; return nla_put_be32(skb, attrtype, tmp); } /** * nla_put_in6_addr - Add an IPv6 address netlink attribute to a socket * buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @addr: IPv6 address */ static inline int nla_put_in6_addr(struct sk_buff *skb, int attrtype, const struct in6_addr *addr) { return nla_put(skb, attrtype, sizeof(*addr), addr); } /** * nla_put_bitfield32 - Add a bitfield32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: value carrying bits * @selector: selector of valid bits */ static inline int nla_put_bitfield32(struct sk_buff *skb, int attrtype, __u32 value, __u32 selector) { struct nla_bitfield32 tmp = { value, selector, }; return nla_put(skb, attrtype, sizeof(tmp), &tmp); } /** * nla_get_u32 - return payload of u32 attribute * @nla: u32 netlink attribute */ static inline u32 nla_get_u32(const struct nlattr *nla) { return *(u32 *) nla_data(nla); } /** * nla_get_u32_default - return payload of u32 attribute or default * @nla: u32 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u32 nla_get_u32_default(const struct nlattr *nla, u32 defvalue) { if (!nla) return defvalue; return nla_get_u32(nla); } /** * nla_get_be32 - return payload of __be32 attribute * @nla: __be32 netlink attribute */ static inline __be32 nla_get_be32(const struct nlattr *nla) { return *(__be32 *) nla_data(nla); } /** * nla_get_be32_default - return payload of be32 attribute or default * @nla: __be32 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __be32 nla_get_be32_default(const struct nlattr *nla, __be32 defvalue) { if (!nla) return defvalue; return nla_get_be32(nla); } /** * nla_get_le32 - return payload of __le32 attribute * @nla: __le32 netlink attribute */ static inline __le32 nla_get_le32(const struct nlattr *nla) { return *(__le32 *) nla_data(nla); } /** * nla_get_le32_default - return payload of le32 attribute or default * @nla: __le32 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __le32 nla_get_le32_default(const struct nlattr *nla, __le32 defvalue) { if (!nla) return defvalue; return nla_get_le32(nla); } /** * nla_get_u16 - return payload of u16 attribute * @nla: u16 netlink attribute */ static inline u16 nla_get_u16(const struct nlattr *nla) { return *(u16 *) nla_data(nla); } /** * nla_get_u16_default - return payload of u16 attribute or default * @nla: u16 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u16 nla_get_u16_default(const struct nlattr *nla, u16 defvalue) { if (!nla) return defvalue; return nla_get_u16(nla); } /** * nla_get_be16 - return payload of __be16 attribute * @nla: __be16 netlink attribute */ static inline __be16 nla_get_be16(const struct nlattr *nla) { return *(__be16 *) nla_data(nla); } /** * nla_get_be16_default - return payload of be16 attribute or default * @nla: __be16 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __be16 nla_get_be16_default(const struct nlattr *nla, __be16 defvalue) { if (!nla) return defvalue; return nla_get_be16(nla); } /** * nla_get_le16 - return payload of __le16 attribute * @nla: __le16 netlink attribute */ static inline __le16 nla_get_le16(const struct nlattr *nla) { return *(__le16 *) nla_data(nla); } /** * nla_get_le16_default - return payload of le16 attribute or default * @nla: __le16 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __le16 nla_get_le16_default(const struct nlattr *nla, __le16 defvalue) { if (!nla) return defvalue; return nla_get_le16(nla); } /** * nla_get_u8 - return payload of u8 attribute * @nla: u8 netlink attribute */ static inline u8 nla_get_u8(const struct nlattr *nla) { return *(u8 *) nla_data(nla); } /** * nla_get_u8_default - return payload of u8 attribute or default * @nla: u8 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u8 nla_get_u8_default(const struct nlattr *nla, u8 defvalue) { if (!nla) return defvalue; return nla_get_u8(nla); } /** * nla_get_u64 - return payload of u64 attribute * @nla: u64 netlink attribute */ static inline u64 nla_get_u64(const struct nlattr *nla) { u64 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_u64_default - return payload of u64 attribute or default * @nla: u64 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u64 nla_get_u64_default(const struct nlattr *nla, u64 defvalue) { if (!nla) return defvalue; return nla_get_u64(nla); } /** * nla_get_uint - return payload of uint attribute * @nla: uint netlink attribute */ static inline u64 nla_get_uint(const struct nlattr *nla) { if (nla_len(nla) == sizeof(u32)) return nla_get_u32(nla); return nla_get_u64(nla); } /** * nla_get_uint_default - return payload of uint attribute or default * @nla: uint netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline u64 nla_get_uint_default(const struct nlattr *nla, u64 defvalue) { if (!nla) return defvalue; return nla_get_uint(nla); } /** * nla_get_be64 - return payload of __be64 attribute * @nla: __be64 netlink attribute */ static inline __be64 nla_get_be64(const struct nlattr *nla) { __be64 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_be64_default - return payload of be64 attribute or default * @nla: __be64 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __be64 nla_get_be64_default(const struct nlattr *nla, __be64 defvalue) { if (!nla) return defvalue; return nla_get_be64(nla); } /** * nla_get_le64 - return payload of __le64 attribute * @nla: __le64 netlink attribute */ static inline __le64 nla_get_le64(const struct nlattr *nla) { return *(__le64 *) nla_data(nla); } /** * nla_get_le64_default - return payload of le64 attribute or default * @nla: __le64 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __le64 nla_get_le64_default(const struct nlattr *nla, __le64 defvalue) { if (!nla) return defvalue; return nla_get_le64(nla); } /** * nla_get_s32 - return payload of s32 attribute * @nla: s32 netlink attribute */ static inline s32 nla_get_s32(const struct nlattr *nla) { return *(s32 *) nla_data(nla); } /** * nla_get_s32_default - return payload of s32 attribute or default * @nla: s32 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s32 nla_get_s32_default(const struct nlattr *nla, s32 defvalue) { if (!nla) return defvalue; return nla_get_s32(nla); } /** * nla_get_s16 - return payload of s16 attribute * @nla: s16 netlink attribute */ static inline s16 nla_get_s16(const struct nlattr *nla) { return *(s16 *) nla_data(nla); } /** * nla_get_s16_default - return payload of s16 attribute or default * @nla: s16 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s16 nla_get_s16_default(const struct nlattr *nla, s16 defvalue) { if (!nla) return defvalue; return nla_get_s16(nla); } /** * nla_get_s8 - return payload of s8 attribute * @nla: s8 netlink attribute */ static inline s8 nla_get_s8(const struct nlattr *nla) { return *(s8 *) nla_data(nla); } /** * nla_get_s8_default - return payload of s8 attribute or default * @nla: s8 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s8 nla_get_s8_default(const struct nlattr *nla, s8 defvalue) { if (!nla) return defvalue; return nla_get_s8(nla); } /** * nla_get_s64 - return payload of s64 attribute * @nla: s64 netlink attribute */ static inline s64 nla_get_s64(const struct nlattr *nla) { s64 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_s64_default - return payload of s64 attribute or default * @nla: s64 netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s64 nla_get_s64_default(const struct nlattr *nla, s64 defvalue) { if (!nla) return defvalue; return nla_get_s64(nla); } /** * nla_get_sint - return payload of uint attribute * @nla: uint netlink attribute */ static inline s64 nla_get_sint(const struct nlattr *nla) { if (nla_len(nla) == sizeof(s32)) return nla_get_s32(nla); return nla_get_s64(nla); } /** * nla_get_sint_default - return payload of sint attribute or default * @nla: sint netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline s64 nla_get_sint_default(const struct nlattr *nla, s64 defvalue) { if (!nla) return defvalue; return nla_get_sint(nla); } /** * nla_get_flag - return payload of flag attribute * @nla: flag netlink attribute */ static inline int nla_get_flag(const struct nlattr *nla) { return !!nla; } /** * nla_get_msecs - return payload of msecs attribute * @nla: msecs netlink attribute * * Returns the number of milliseconds in jiffies. */ static inline unsigned long nla_get_msecs(const struct nlattr *nla) { u64 msecs = nla_get_u64(nla); return msecs_to_jiffies((unsigned long) msecs); } /** * nla_get_msecs_default - return payload of msecs attribute or default * @nla: msecs netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline unsigned long nla_get_msecs_default(const struct nlattr *nla, unsigned long defvalue) { if (!nla) return defvalue; return nla_get_msecs(nla); } /** * nla_get_in_addr - return payload of IPv4 address attribute * @nla: IPv4 address netlink attribute */ static inline __be32 nla_get_in_addr(const struct nlattr *nla) { return *(__be32 *) nla_data(nla); } /** * nla_get_in_addr_default - return payload of be32 attribute or default * @nla: IPv4 address netlink attribute, may be %NULL * @defvalue: default value to use if @nla is %NULL * * Return: the value of the attribute, or the default value if not present */ static inline __be32 nla_get_in_addr_default(const struct nlattr *nla, __be32 defvalue) { if (!nla) return defvalue; return nla_get_in_addr(nla); } /** * nla_get_in6_addr - return payload of IPv6 address attribute * @nla: IPv6 address netlink attribute */ static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla) { struct in6_addr tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_bitfield32 - return payload of 32 bitfield attribute * @nla: nla_bitfield32 attribute */ static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla) { struct nla_bitfield32 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_memdup - duplicate attribute memory (kmemdup) * @src: netlink attribute to duplicate from * @gfp: GFP mask */ static inline void *nla_memdup_noprof(const struct nlattr *src, gfp_t gfp) { return kmemdup_noprof(nla_data(src), nla_len(src), gfp); } #define nla_memdup(...) alloc_hooks(nla_memdup_noprof(__VA_ARGS__)) /** * nla_nest_start_noflag - Start a new level of nested attributes * @skb: socket buffer to add attributes to * @attrtype: attribute type of container * * This function exists for backward compatibility to use in APIs which never * marked their nest attributes with NLA_F_NESTED flag. New APIs should use * nla_nest_start() which sets the flag. * * Returns the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb, int attrtype) { struct nlattr *start = (struct nlattr *)skb_tail_pointer(skb); if (nla_put(skb, attrtype, 0, NULL) < 0) return NULL; return start; } /** * nla_nest_start - Start a new level of nested attributes, with NLA_F_NESTED * @skb: socket buffer to add attributes to * @attrtype: attribute type of container * * Unlike nla_nest_start_noflag(), mark the nest attribute with NLA_F_NESTED * flag. This is the preferred function to use in new code. * * Returns the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) { return nla_nest_start_noflag(skb, attrtype | NLA_F_NESTED); } /** * nla_nest_end - Finalize nesting of attributes * @skb: socket buffer the attributes are stored in * @start: container attribute * * Corrects the container attribute header to include the all * appended attributes. * * Returns the total data length of the skb. */ static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start) { start->nla_len = skb_tail_pointer(skb) - (unsigned char *)start; return skb->len; } /** * nla_nest_cancel - Cancel nesting of attributes * @skb: socket buffer the message is stored in * @start: container attribute * * Removes the container attribute and including all nested * attributes. Returns -EMSGSIZE */ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) { nlmsg_trim(skb, start); } /** * __nla_validate_nested - Validate a stream of nested attributes * @start: container attribute * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the nested attribute stream against the * specified policy. Attributes with a type exceeding maxtype will be * ignored. See documentation of struct nla_policy for more details. * * Returns 0 on success or a negative error code. */ static inline int __nla_validate_nested(const struct nlattr *start, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate(nla_data(start), nla_len(start), maxtype, policy, validate, extack); } static inline int nla_validate_nested(const struct nlattr *start, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate_nested(start, maxtype, policy, NL_VALIDATE_STRICT, extack); } static inline int nla_validate_nested_deprecated(const struct nlattr *start, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate_nested(start, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_need_padding_for_64bit - test 64-bit alignment of the next attribute * @skb: socket buffer the message is stored in * * Return true if padding is needed to align the next attribute (nla_data()) to * a 64-bit aligned area. */ static inline bool nla_need_padding_for_64bit(struct sk_buff *skb) { #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS /* The nlattr header is 4 bytes in size, that's why we test * if the skb->data _is_ aligned. A NOP attribute, plus * nlattr header for next attribute, will make nla_data() * 8-byte aligned. */ if (IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8)) return true; #endif return false; } /** * nla_align_64bit - 64-bit align the nla_data() of next attribute * @skb: socket buffer the message is stored in * @padattr: attribute type for the padding * * Conditionally emit a padding netlink attribute in order to make * the next attribute we emit have a 64-bit aligned nla_data() area. * This will only be done in architectures which do not have * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS defined. * * Returns zero on success or a negative error code. */ static inline int nla_align_64bit(struct sk_buff *skb, int padattr) { if (nla_need_padding_for_64bit(skb) && !nla_reserve(skb, padattr, 0)) return -EMSGSIZE; return 0; } /** * nla_total_size_64bit - total length of attribute including padding * @payload: length of payload */ static inline int nla_total_size_64bit(int payload) { return NLA_ALIGN(nla_attr_size(payload)) #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + NLA_ALIGN(nla_attr_size(0)) #endif ; } /** * nla_for_each_attr - iterate over a stream of attributes * @pos: loop counter, set to current attribute * @head: head of attribute stream * @len: length of attribute stream * @rem: initialized to len, holds bytes currently remaining in stream */ #define nla_for_each_attr(pos, head, len, rem) \ for (pos = head, rem = len; \ nla_ok(pos, rem); \ pos = nla_next(pos, &(rem))) /** * nla_for_each_attr_type - iterate over a stream of attributes * @pos: loop counter, set to current attribute * @type: required attribute type for @pos * @head: head of attribute stream * @len: length of attribute stream * @rem: initialized to len, holds bytes currently remaining in stream */ #define nla_for_each_attr_type(pos, type, head, len, rem) \ nla_for_each_attr(pos, head, len, rem) \ if (nla_type(pos) == type) /** * nla_for_each_nested - iterate over nested attributes * @pos: loop counter, set to current attribute * @nla: attribute containing the nested attributes * @rem: initialized to len, holds bytes currently remaining in stream */ #define nla_for_each_nested(pos, nla, rem) \ nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem) /** * nla_for_each_nested_type - iterate over nested attributes * @pos: loop counter, set to current attribute * @type: required attribute type for @pos * @nla: attribute containing the nested attributes * @rem: initialized to len, holds bytes currently remaining in stream */ #define nla_for_each_nested_type(pos, type, nla, rem) \ nla_for_each_nested(pos, nla, rem) \ if (nla_type(pos) == type) /** * nla_is_last - Test if attribute is last in stream * @nla: attribute to test * @rem: bytes remaining in stream */ static inline bool nla_is_last(const struct nlattr *nla, int rem) { return nla->nla_len == rem; } void nla_get_range_unsigned(const struct nla_policy *pt, struct netlink_range_validation *range); void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range); struct netlink_policy_dump_state; int netlink_policy_dump_add_policy(struct netlink_policy_dump_state **pstate, const struct nla_policy *policy, unsigned int maxtype); int netlink_policy_dump_get_policy_idx(struct netlink_policy_dump_state *state, const struct nla_policy *policy, unsigned int maxtype); bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state); int netlink_policy_dump_write(struct sk_buff *skb, struct netlink_policy_dump_state *state); int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt); int netlink_policy_dump_write_attr(struct sk_buff *skb, const struct nla_policy *pt, int nestattr); void netlink_policy_dump_free(struct netlink_policy_dump_state *state); #endif |
1 2 15 15 15 9 9 3 1 4 5 4 5 1 12 12 9 6 6 2 1 1 13 13 2 2 77 7 118 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/fs.h> #include <linux/xattr.h> #include "overlayfs.h" static bool ovl_is_escaped_xattr(struct super_block *sb, const char *name) { struct ovl_fs *ofs = sb->s_fs_info; if (ofs->config.userxattr) return strncmp(name, OVL_XATTR_ESCAPE_USER_PREFIX, OVL_XATTR_ESCAPE_USER_PREFIX_LEN) == 0; else return strncmp(name, OVL_XATTR_ESCAPE_TRUSTED_PREFIX, OVL_XATTR_ESCAPE_TRUSTED_PREFIX_LEN - 1) == 0; } static bool ovl_is_own_xattr(struct super_block *sb, const char *name) { struct ovl_fs *ofs = OVL_FS(sb); if (ofs->config.userxattr) return strncmp(name, OVL_XATTR_USER_PREFIX, OVL_XATTR_USER_PREFIX_LEN) == 0; else return strncmp(name, OVL_XATTR_TRUSTED_PREFIX, OVL_XATTR_TRUSTED_PREFIX_LEN) == 0; } bool ovl_is_private_xattr(struct super_block *sb, const char *name) { return ovl_is_own_xattr(sb, name) && !ovl_is_escaped_xattr(sb, name); } static int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { int err; struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *upperdentry = ovl_i_dentry_upper(inode); struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); struct path realpath; const struct cred *old_cred; if (!value && !upperdentry) { ovl_path_lower(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); err = vfs_getxattr(mnt_idmap(realpath.mnt), realdentry, name, NULL, 0); ovl_revert_creds(old_cred); if (err < 0) goto out; } if (!upperdentry) { err = ovl_copy_up(dentry); if (err) goto out; realdentry = ovl_dentry_upper(dentry); } err = ovl_want_write(dentry); if (err) goto out; old_cred = ovl_override_creds(dentry->d_sb); if (value) { err = ovl_do_setxattr(ofs, realdentry, name, value, size, flags); } else { WARN_ON(flags != XATTR_REPLACE); err = ovl_do_removexattr(ofs, realdentry, name); } ovl_revert_creds(old_cred); ovl_drop_write(dentry); /* copy c/mtime */ ovl_copyattr(inode); out: return err; } static int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { ssize_t res; const struct cred *old_cred; struct path realpath; ovl_i_path_real(inode, &realpath); old_cred = ovl_override_creds(dentry->d_sb); res = vfs_getxattr(mnt_idmap(realpath.mnt), realpath.dentry, name, value, size); ovl_revert_creds(old_cred); return res; } static bool ovl_can_list(struct super_block *sb, const char *s) { /* Never list private (.overlay) */ if (ovl_is_private_xattr(sb, s)) return false; /* List all non-trusted xattrs */ if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0) return true; /* list other trusted for superuser only */ return ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); } ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) { struct dentry *realdentry = ovl_dentry_real(dentry); struct ovl_fs *ofs = OVL_FS(dentry->d_sb); ssize_t res; size_t len; char *s; const struct cred *old_cred; size_t prefix_len, name_len; old_cred = ovl_override_creds(dentry->d_sb); res = vfs_listxattr(realdentry, list, size); ovl_revert_creds(old_cred); if (res <= 0 || size == 0) return res; prefix_len = ofs->config.userxattr ? OVL_XATTR_USER_PREFIX_LEN : OVL_XATTR_TRUSTED_PREFIX_LEN; /* filter out private xattrs */ for (s = list, len = res; len;) { size_t slen = strnlen(s, len) + 1; /* underlying fs providing us with an broken xattr list? */ if (WARN_ON(slen > len)) return -EIO; len -= slen; if (!ovl_can_list(dentry->d_sb, s)) { res -= slen; memmove(s, s + slen, len); } else if (ovl_is_escaped_xattr(dentry->d_sb, s)) { res -= OVL_XATTR_ESCAPE_PREFIX_LEN; name_len = slen - prefix_len - OVL_XATTR_ESCAPE_PREFIX_LEN; s += prefix_len; memmove(s, s + OVL_XATTR_ESCAPE_PREFIX_LEN, name_len + len); s += name_len; } else { s += slen; } } return res; } static char *ovl_xattr_escape_name(const char *prefix, const char *name) { size_t prefix_len = strlen(prefix); size_t name_len = strlen(name); size_t escaped_len; char *escaped, *s; escaped_len = prefix_len + OVL_XATTR_ESCAPE_PREFIX_LEN + name_len; if (escaped_len > XATTR_NAME_MAX) return ERR_PTR(-EOPNOTSUPP); escaped = kmalloc(escaped_len + 1, GFP_KERNEL); if (escaped == NULL) return ERR_PTR(-ENOMEM); s = escaped; memcpy(s, prefix, prefix_len); s += prefix_len; memcpy(s, OVL_XATTR_ESCAPE_PREFIX, OVL_XATTR_ESCAPE_PREFIX_LEN); s += OVL_XATTR_ESCAPE_PREFIX_LEN; memcpy(s, name, name_len + 1); return escaped; } static int ovl_own_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { char *escaped; int r; escaped = ovl_xattr_escape_name(handler->prefix, name); if (IS_ERR(escaped)) return PTR_ERR(escaped); r = ovl_xattr_get(dentry, inode, escaped, buffer, size); kfree(escaped); return r; } static int ovl_own_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { char *escaped; int r; escaped = ovl_xattr_escape_name(handler->prefix, name); if (IS_ERR(escaped)) return PTR_ERR(escaped); r = ovl_xattr_set(dentry, inode, escaped, value, size, flags); kfree(escaped); return r; } static int ovl_other_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { return ovl_xattr_get(dentry, inode, name, buffer, size); } static int ovl_other_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { return ovl_xattr_set(dentry, inode, name, value, size, flags); } static const struct xattr_handler ovl_own_trusted_xattr_handler = { .prefix = OVL_XATTR_TRUSTED_PREFIX, .get = ovl_own_xattr_get, .set = ovl_own_xattr_set, }; static const struct xattr_handler ovl_own_user_xattr_handler = { .prefix = OVL_XATTR_USER_PREFIX, .get = ovl_own_xattr_get, .set = ovl_own_xattr_set, }; static const struct xattr_handler ovl_other_xattr_handler = { .prefix = "", /* catch all */ .get = ovl_other_xattr_get, .set = ovl_other_xattr_set, }; static const struct xattr_handler * const ovl_trusted_xattr_handlers[] = { &ovl_own_trusted_xattr_handler, &ovl_other_xattr_handler, NULL }; static const struct xattr_handler * const ovl_user_xattr_handlers[] = { &ovl_own_user_xattr_handler, &ovl_other_xattr_handler, NULL }; const struct xattr_handler * const *ovl_xattr_handlers(struct ovl_fs *ofs) { return ofs->config.userxattr ? ovl_user_xattr_handlers : ovl_trusted_xattr_handlers; } |
81 4 77 3 3 3 3 3 174 292 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2017 * * This file is part of the SCTP kernel implementation * * These functions implement sctp stream message interleaving, mostly * including I-DATA and I-FORWARD-TSN chunks process. * * Please send any bug reports or fixes you make to the * email addresched(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Xin Long <lucien.xin@gmail.com> */ #include <net/busy_poll.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/ulpevent.h> #include <linux/sctp.h> static struct sctp_chunk *sctp_make_idatafrag_empty( const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, int len, __u8 flags, gfp_t gfp) { struct sctp_chunk *retval; struct sctp_idatahdr dp; memset(&dp, 0, sizeof(dp)); dp.stream = htons(sinfo->sinfo_stream); if (sinfo->sinfo_flags & SCTP_UNORDERED) flags |= SCTP_DATA_UNORDERED; retval = sctp_make_idata(asoc, flags, sizeof(dp) + len, gfp); if (!retval) return NULL; retval->subh.idata_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); return retval; } static void sctp_chunk_assign_mid(struct sctp_chunk *chunk) { struct sctp_stream *stream; struct sctp_chunk *lchunk; __u32 cfsn = 0; __u16 sid; if (chunk->has_mid) return; sid = sctp_chunk_stream_no(chunk); stream = &chunk->asoc->stream; list_for_each_entry(lchunk, &chunk->msg->chunks, frag_list) { struct sctp_idatahdr *hdr; __u32 mid; lchunk->has_mid = 1; hdr = lchunk->subh.idata_hdr; if (lchunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG) hdr->ppid = lchunk->sinfo.sinfo_ppid; else hdr->fsn = htonl(cfsn++); if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ? sctp_mid_uo_next(stream, out, sid) : sctp_mid_uo_peek(stream, out, sid); } else { mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ? sctp_mid_next(stream, out, sid) : sctp_mid_peek(stream, out, sid); } hdr->mid = htonl(mid); } } static bool sctp_validate_data(struct sctp_chunk *chunk) { struct sctp_stream *stream; __u16 sid, ssn; if (chunk->chunk_hdr->type != SCTP_CID_DATA) return false; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) return true; stream = &chunk->asoc->stream; sid = sctp_chunk_stream_no(chunk); ssn = ntohs(chunk->subh.data_hdr->ssn); return !SSN_lt(ssn, sctp_ssn_peek(stream, in, sid)); } static bool sctp_validate_idata(struct sctp_chunk *chunk) { struct sctp_stream *stream; __u32 mid; __u16 sid; if (chunk->chunk_hdr->type != SCTP_CID_I_DATA) return false; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) return true; stream = &chunk->asoc->stream; sid = sctp_chunk_stream_no(chunk); mid = ntohl(chunk->subh.idata_hdr->mid); return !MID_lt(mid, sctp_mid_peek(stream, in, sid)); } static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->reasm); if (!pos) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } cevent = sctp_skb2event(pos); if (event->stream == cevent->stream && event->mid == cevent->mid && (cevent->msg_flags & SCTP_DATA_FIRST_FRAG || (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) && event->fsn > cevent->fsn))) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } if ((event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) || event->stream > cevent->stream) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } loc = NULL; skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); if (event->stream < cevent->stream || (event->stream == cevent->stream && MID_lt(event->mid, cevent->mid))) { loc = pos; break; } if (event->stream == cevent->stream && event->mid == cevent->mid && !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && (event->msg_flags & SCTP_DATA_FIRST_FRAG || event->fsn < cevent->fsn)) { loc = pos; break; } } if (!loc) __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); else __skb_queue_before(&ulpq->reasm, loc, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_intl_retrieve_partial( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sctp_stream_in *sin; struct sk_buff *pos; __u32 next_fsn = 0; int is_last = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream || cevent->mid != sin->mid) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: goto out; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn) { first_frag = pos; last_frag = pos; next_fsn = cevent->fsn + 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn++; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn) { first_frag = pos; last_frag = pos; next_fsn = 0; is_last = 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn = 0; is_last = 1; } goto out; default: goto out; } } out: if (!first_frag) return NULL; retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval) { sin->fsn = next_fsn; if (is_last) { retval->msg_flags |= MSG_EOR; sin->pd_mode = 0; } } return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_reassembled( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_association *asoc = ulpq->asoc; struct sk_buff *pos, *first_frag = NULL; struct sctp_ulpevent *retval = NULL; struct sk_buff *pd_first = NULL; struct sk_buff *pd_last = NULL; struct sctp_stream_in *sin; __u32 next_fsn = 0; __u32 pd_point = 0; __u32 pd_len = 0; __u32 mid = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, event->mid)) continue; if (MID_lt(event->mid, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (cevent->mid == sin->mid) { pd_first = pos; pd_last = pos; pd_len = pos->len; } first_frag = pos; next_fsn = 0; mid = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) { next_fsn++; if (pd_first) { pd_last = pos; pd_len += pos->len; } } else { first_frag = NULL; } break; case SCTP_DATA_LAST_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) goto found; else first_frag = NULL; break; } } if (!pd_first) goto out; pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, pd_first, pd_last); if (retval) { sin->fsn = next_fsn; sin->pd_mode = 1; } } goto out; found: retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; out: return retval; } static struct sctp_ulpevent *sctp_intl_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *retval = NULL; struct sctp_stream_in *sin; if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { event->msg_flags |= MSG_EOR; return event; } sctp_intl_store_reasm(ulpq, event); sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); if (sin->pd_mode && event->mid == sin->mid && event->fsn == sin->fsn) retval = sctp_intl_retrieve_partial(ulpq, event); if (!retval) retval = sctp_intl_retrieve_reassembled(ulpq, event); return retval; } static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->lobby); if (!pos) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } cevent = (struct sctp_ulpevent *)pos->cb; if (event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } if (event->stream > cevent->stream) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } loc = NULL; skb_queue_walk(&ulpq->lobby, pos) { cevent = (struct sctp_ulpevent *)pos->cb; if (cevent->stream > event->stream) { loc = pos; break; } if (cevent->stream == event->stream && MID_lt(event->mid, cevent->mid)) { loc = pos; break; } } if (!loc) __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); else __skb_queue_before(&ulpq->lobby, loc, sctp_event2skb(event)); } static void sctp_intl_retrieve_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head *event_list; struct sctp_stream *stream; struct sk_buff *pos, *tmp; __u16 sid = event->stream; stream = &ulpq->asoc->stream; event_list = (struct sk_buff_head *)sctp_event2skb(event)->prev; sctp_skb_for_each(pos, &ulpq->lobby, tmp) { struct sctp_ulpevent *cevent = (struct sctp_ulpevent *)pos->cb; if (cevent->stream > sid) break; if (cevent->stream < sid) continue; if (cevent->mid != sctp_mid_peek(stream, in, sid)) break; sctp_mid_next(stream, in, sid); __skb_unlink(pos, &ulpq->lobby); __skb_queue_tail(event_list, pos); } } static struct sctp_ulpevent *sctp_intl_order(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_stream *stream; __u16 sid; stream = &ulpq->asoc->stream; sid = event->stream; if (event->mid != sctp_mid_peek(stream, in, sid)) { sctp_intl_store_ordered(ulpq, event); return NULL; } sctp_mid_next(stream, in, sid); sctp_intl_retrieve_ordered(ulpq, event); return event; } static int sctp_enqueue_event(struct sctp_ulpq *ulpq, struct sk_buff_head *skb_list) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_sock *sp = sctp_sk(sk); struct sctp_ulpevent *event; struct sk_buff *skb; skb = __skb_peek(skb_list); event = sctp_skb2event(skb); if (sk->sk_shutdown & RCV_SHUTDOWN && (sk->sk_shutdown & SEND_SHUTDOWN || !sctp_ulpevent_is_notification(event))) goto out_free; if (!sctp_ulpevent_is_notification(event)) { sk_mark_napi_id(sk, skb); sk_incoming_cpu_update(sk); } if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe)) goto out_free; skb_queue_splice_tail_init(skb_list, &sk->sk_receive_queue); if (!sp->data_ready_signalled) { sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } return 1; out_free: sctp_queue_purge_ulpevents(skb_list); return 0; } static void sctp_intl_store_reasm_uo(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos; pos = skb_peek_tail(&ulpq->reasm_uo); if (!pos) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } cevent = sctp_skb2event(pos); if (event->stream == cevent->stream && event->mid == cevent->mid && (cevent->msg_flags & SCTP_DATA_FIRST_FRAG || (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) && event->fsn > cevent->fsn))) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } if ((event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) || event->stream > cevent->stream) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } skb_queue_walk(&ulpq->reasm_uo, pos) { cevent = sctp_skb2event(pos); if (event->stream < cevent->stream || (event->stream == cevent->stream && MID_lt(event->mid, cevent->mid))) break; if (event->stream == cevent->stream && event->mid == cevent->mid && !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && (event->msg_flags & SCTP_DATA_FIRST_FRAG || event->fsn < cevent->fsn)) break; } __skb_queue_before(&ulpq->reasm_uo, pos, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_intl_retrieve_partial_uo( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sctp_stream_in *sin; struct sk_buff *pos; __u32 next_fsn = 0; int is_last = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, sin->mid_uo)) continue; if (MID_lt(sin->mid_uo, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: goto out; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn_uo) { first_frag = pos; last_frag = pos; next_fsn = cevent->fsn + 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn++; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn_uo) { first_frag = pos; last_frag = pos; next_fsn = 0; is_last = 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn = 0; is_last = 1; } goto out; default: goto out; } } out: if (!first_frag) return NULL; retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { sin->fsn_uo = next_fsn; if (is_last) { retval->msg_flags |= MSG_EOR; sin->pd_mode_uo = 0; } } return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_association *asoc = ulpq->asoc; struct sk_buff *pos, *first_frag = NULL; struct sctp_ulpevent *retval = NULL; struct sk_buff *pd_first = NULL; struct sk_buff *pd_last = NULL; struct sctp_stream_in *sin; __u32 next_fsn = 0; __u32 pd_point = 0; __u32 pd_len = 0; __u32 mid = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, event->mid)) continue; if (MID_lt(event->mid, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (!sin->pd_mode_uo) { sin->mid_uo = cevent->mid; pd_first = pos; pd_last = pos; pd_len = pos->len; } first_frag = pos; next_fsn = 0; mid = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) { next_fsn++; if (pd_first) { pd_last = pos; pd_len += pos->len; } } else { first_frag = NULL; } break; case SCTP_DATA_LAST_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) goto found; else first_frag = NULL; break; } } if (!pd_first) goto out; pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, pd_first, pd_last); if (retval) { sin->fsn_uo = next_fsn; sin->pd_mode_uo = 1; } } goto out; found: retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; out: return retval; } static struct sctp_ulpevent *sctp_intl_reasm_uo(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *retval = NULL; struct sctp_stream_in *sin; if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { event->msg_flags |= MSG_EOR; return event; } sctp_intl_store_reasm_uo(ulpq, event); sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); if (sin->pd_mode_uo && event->mid == sin->mid_uo && event->fsn == sin->fsn_uo) retval = sctp_intl_retrieve_partial_uo(ulpq, event); if (!retval) retval = sctp_intl_retrieve_reassembled_uo(ulpq, event); return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq) { struct sctp_stream_in *csin, *sin = NULL; struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sk_buff *pos; __u32 next_fsn = 0; __u16 sid = 0; skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream); if (csin->pd_mode_uo) continue; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (first_frag) goto out; first_frag = pos; last_frag = pos; next_fsn = 0; sin = csin; sid = cevent->stream; sin->mid_uo = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) break; if (cevent->stream == sid && cevent->mid == sin->mid_uo && cevent->fsn == next_fsn) { next_fsn++; last_frag = pos; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (first_frag) goto out; break; default: break; } } if (!first_frag) return NULL; out: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { sin->fsn_uo = next_fsn; sin->pd_mode_uo = 1; } return retval; } static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_ulpevent *event; struct sk_buff_head temp; int event_eor = 0; event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp); if (!event) return -ENOMEM; event->mid = ntohl(chunk->subh.idata_hdr->mid); if (event->msg_flags & SCTP_DATA_FIRST_FRAG) event->ppid = chunk->subh.idata_hdr->ppid; else event->fsn = ntohl(chunk->subh.idata_hdr->fsn); if (!(event->msg_flags & SCTP_DATA_UNORDERED)) { event = sctp_intl_reasm(ulpq, event); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); if (event->msg_flags & MSG_EOR) event = sctp_intl_order(ulpq, event); } } else { event = sctp_intl_reasm_uo(ulpq, event); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); } } if (event) { event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0; sctp_enqueue_event(ulpq, &temp); } return event_eor; } static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq) { struct sctp_stream_in *csin, *sin = NULL; struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sk_buff *pos; __u32 next_fsn = 0; __u16 sid = 0; skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream); if (csin->pd_mode) continue; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (first_frag) goto out; if (cevent->mid == csin->mid) { first_frag = pos; last_frag = pos; next_fsn = 0; sin = csin; sid = cevent->stream; } break; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) break; if (cevent->stream == sid && cevent->mid == sin->mid && cevent->fsn == next_fsn) { next_fsn++; last_frag = pos; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (first_frag) goto out; break; default: break; } } if (!first_frag) return NULL; out: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval) { sin->fsn = next_fsn; sin->pd_mode = 1; } return retval; } static void sctp_intl_start_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_ulpevent *event; struct sk_buff_head temp; if (!skb_queue_empty(&ulpq->reasm)) { do { event = sctp_intl_retrieve_first(ulpq); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); sctp_enqueue_event(ulpq, &temp); } } while (event); } if (!skb_queue_empty(&ulpq->reasm_uo)) { do { event = sctp_intl_retrieve_first_uo(ulpq); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); sctp_enqueue_event(ulpq, &temp); } } while (event); } } static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_association *asoc = ulpq->asoc; __u32 freed = 0; __u16 needed; needed = ntohs(chunk->chunk_hdr->length) - sizeof(struct sctp_idata_chunk); if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) { freed = sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed); if (freed < needed) freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm, needed); if (freed < needed) freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm_uo, needed); } if (freed >= needed && sctp_ulpevent_idata(ulpq, chunk, gfp) <= 0) sctp_intl_start_pd(ulpq, gfp); } static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, __u16 flags, gfp_t gfp) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_ulpevent *ev = NULL; if (!sctp_ulpevent_type_enabled(ulpq->asoc->subscribe, SCTP_PARTIAL_DELIVERY_EVENT)) return; ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, sid, mid, flags, gfp); if (ev) { struct sctp_sock *sp = sctp_sk(sk); __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); if (!sp->data_ready_signalled) { sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } } } static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) { struct sctp_stream *stream = &ulpq->asoc->stream; struct sctp_ulpevent *cevent, *event = NULL; struct sk_buff_head *lobby = &ulpq->lobby; struct sk_buff *pos, *tmp; struct sk_buff_head temp; __u16 csid; __u32 cmid; skb_queue_head_init(&temp); sctp_skb_for_each(pos, lobby, tmp) { cevent = (struct sctp_ulpevent *)pos->cb; csid = cevent->stream; cmid = cevent->mid; if (csid > sid) break; if (csid < sid) continue; if (!MID_lt(cmid, sctp_mid_peek(stream, in, csid))) break; __skb_unlink(pos, lobby); if (!event) event = sctp_skb2event(pos); __skb_queue_tail(&temp, pos); } if (!event && pos != (struct sk_buff *)lobby) { cevent = (struct sctp_ulpevent *)pos->cb; csid = cevent->stream; cmid = cevent->mid; if (csid == sid && cmid == sctp_mid_peek(stream, in, csid)) { sctp_mid_next(stream, in, csid); __skb_unlink(pos, lobby); __skb_queue_tail(&temp, pos); event = sctp_skb2event(pos); } } if (event) { sctp_intl_retrieve_ordered(ulpq, event); sctp_enqueue_event(ulpq, &temp); } } static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_stream *stream = &ulpq->asoc->stream; __u16 sid; for (sid = 0; sid < stream->incnt; sid++) { struct sctp_stream_in *sin = SCTP_SI(stream, sid); __u32 mid; if (sin->pd_mode_uo) { sin->pd_mode_uo = 0; mid = sin->mid_uo; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, gfp); } if (sin->pd_mode) { sin->pd_mode = 0; mid = sin->mid; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0, gfp); sctp_mid_skip(stream, in, sid, mid); sctp_intl_reap_ordered(ulpq, sid); } } /* intl abort pd happens only when all data needs to be cleaned */ sctp_ulpq_flush(ulpq); } static inline int sctp_get_skip_pos(struct sctp_ifwdtsn_skip *skiplist, int nskips, __be16 stream, __u8 flags) { int i; for (i = 0; i < nskips; i++) if (skiplist[i].stream == stream && skiplist[i].flags == flags) return i; return i; } #define SCTP_FTSN_U_BIT 0x1 static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn) { struct sctp_ifwdtsn_skip ftsn_skip_arr[10]; struct sctp_association *asoc = q->asoc; struct sctp_chunk *ftsn_chunk = NULL; struct list_head *lchunk, *temp; int nskips = 0, skip_pos; struct sctp_chunk *chunk; __u32 tsn; if (!asoc->peer.prsctp_capable) return; if (TSN_lt(asoc->adv_peer_ack_point, ctsn)) asoc->adv_peer_ack_point = ctsn; list_for_each_safe(lchunk, temp, &q->abandoned) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); tsn = ntohl(chunk->subh.data_hdr->tsn); if (TSN_lte(tsn, ctsn)) { list_del_init(lchunk); sctp_chunk_free(chunk); } else if (TSN_lte(tsn, asoc->adv_peer_ack_point + 1)) { __be16 sid = chunk->subh.idata_hdr->stream; __be32 mid = chunk->subh.idata_hdr->mid; __u8 flags = 0; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) flags |= SCTP_FTSN_U_BIT; asoc->adv_peer_ack_point = tsn; skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], nskips, sid, flags); ftsn_skip_arr[skip_pos].stream = sid; ftsn_skip_arr[skip_pos].reserved = 0; ftsn_skip_arr[skip_pos].flags = flags; ftsn_skip_arr[skip_pos].mid = mid; if (skip_pos == nskips) nskips++; if (nskips == 10) break; } else { break; } } if (asoc->adv_peer_ack_point > ctsn) ftsn_chunk = sctp_make_ifwdtsn(asoc, asoc->adv_peer_ack_point, nskips, &ftsn_skip_arr[0]); if (ftsn_chunk) { list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); } } #define _sctp_walk_ifwdtsn(pos, chunk, end) \ for (pos = (void *)(chunk->subh.ifwdtsn_hdr + 1); \ (void *)pos <= (void *)(chunk->subh.ifwdtsn_hdr + 1) + (end) - \ sizeof(struct sctp_ifwdtsn_skip); pos++) #define sctp_walk_ifwdtsn(pos, ch) \ _sctp_walk_ifwdtsn((pos), (ch), ntohs((ch)->chunk_hdr->length) - \ sizeof(struct sctp_ifwdtsn_chunk)) static bool sctp_validate_fwdtsn(struct sctp_chunk *chunk) { struct sctp_fwdtsn_skip *skip; __u16 incnt; if (chunk->chunk_hdr->type != SCTP_CID_FWD_TSN) return false; incnt = chunk->asoc->stream.incnt; sctp_walk_fwdtsn(skip, chunk) if (ntohs(skip->stream) >= incnt) return false; return true; } static bool sctp_validate_iftsn(struct sctp_chunk *chunk) { struct sctp_ifwdtsn_skip *skip; __u16 incnt; if (chunk->chunk_hdr->type != SCTP_CID_I_FWD_TSN) return false; incnt = chunk->asoc->stream.incnt; sctp_walk_ifwdtsn(skip, chunk) if (ntohs(skip->stream) >= incnt) return false; return true; } static void sctp_report_fwdtsn(struct sctp_ulpq *ulpq, __u32 ftsn) { /* Move the Cumulattive TSN Ack ahead. */ sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn); /* purge the fragmentation queue */ sctp_ulpq_reasm_flushtsn(ulpq, ftsn); /* Abort any in progress partial delivery. */ sctp_ulpq_abort_pd(ulpq, GFP_ATOMIC); } static void sctp_intl_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 ftsn) { struct sk_buff *pos, *tmp; skb_queue_walk_safe(&ulpq->reasm, pos, tmp) { struct sctp_ulpevent *event = sctp_skb2event(pos); __u32 tsn = event->tsn; if (TSN_lte(tsn, ftsn)) { __skb_unlink(pos, &ulpq->reasm); sctp_ulpevent_free(event); } } skb_queue_walk_safe(&ulpq->reasm_uo, pos, tmp) { struct sctp_ulpevent *event = sctp_skb2event(pos); __u32 tsn = event->tsn; if (TSN_lte(tsn, ftsn)) { __skb_unlink(pos, &ulpq->reasm_uo); sctp_ulpevent_free(event); } } } static void sctp_report_iftsn(struct sctp_ulpq *ulpq, __u32 ftsn) { /* Move the Cumulattive TSN Ack ahead. */ sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn); /* purge the fragmentation queue */ sctp_intl_reasm_flushtsn(ulpq, ftsn); /* abort only when it's for all data */ if (ftsn == sctp_tsnmap_get_max_tsn_seen(&ulpq->asoc->peer.tsn_map)) sctp_intl_abort_pd(ulpq, GFP_ATOMIC); } static void sctp_handle_fwdtsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) { struct sctp_fwdtsn_skip *skip; /* Walk through all the skipped SSNs */ sctp_walk_fwdtsn(skip, chunk) sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn)); } static void sctp_intl_skip(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, __u8 flags) { struct sctp_stream_in *sin = sctp_stream_in(&ulpq->asoc->stream, sid); struct sctp_stream *stream = &ulpq->asoc->stream; if (flags & SCTP_FTSN_U_BIT) { if (sin->pd_mode_uo && MID_lt(sin->mid_uo, mid)) { sin->pd_mode_uo = 0; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, GFP_ATOMIC); } return; } if (MID_lt(mid, sctp_mid_peek(stream, in, sid))) return; if (sin->pd_mode) { sin->pd_mode = 0; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x0, GFP_ATOMIC); } sctp_mid_skip(stream, in, sid, mid); sctp_intl_reap_ordered(ulpq, sid); } static void sctp_handle_iftsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) { struct sctp_ifwdtsn_skip *skip; /* Walk through all the skipped MIDs and abort stream pd if possible */ sctp_walk_ifwdtsn(skip, chunk) sctp_intl_skip(ulpq, ntohs(skip->stream), ntohl(skip->mid), skip->flags); } static int do_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head temp; skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); return sctp_ulpq_tail_event(ulpq, &temp); } static struct sctp_stream_interleave sctp_stream_interleave_0 = { .data_chunk_len = sizeof(struct sctp_data_chunk), .ftsn_chunk_len = sizeof(struct sctp_fwdtsn_chunk), /* DATA process functions */ .make_datafrag = sctp_make_datafrag_empty, .assign_number = sctp_chunk_assign_ssn, .validate_data = sctp_validate_data, .ulpevent_data = sctp_ulpq_tail_data, .enqueue_event = do_ulpq_tail_event, .renege_events = sctp_ulpq_renege, .start_pd = sctp_ulpq_partial_delivery, .abort_pd = sctp_ulpq_abort_pd, /* FORWARD-TSN process functions */ .generate_ftsn = sctp_generate_fwdtsn, .validate_ftsn = sctp_validate_fwdtsn, .report_ftsn = sctp_report_fwdtsn, .handle_ftsn = sctp_handle_fwdtsn, }; static int do_sctp_enqueue_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head temp; skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); return sctp_enqueue_event(ulpq, &temp); } static struct sctp_stream_interleave sctp_stream_interleave_1 = { .data_chunk_len = sizeof(struct sctp_idata_chunk), .ftsn_chunk_len = sizeof(struct sctp_ifwdtsn_chunk), /* I-DATA process functions */ .make_datafrag = sctp_make_idatafrag_empty, .assign_number = sctp_chunk_assign_mid, .validate_data = sctp_validate_idata, .ulpevent_data = sctp_ulpevent_idata, .enqueue_event = do_sctp_enqueue_event, .renege_events = sctp_renege_events, .start_pd = sctp_intl_start_pd, .abort_pd = sctp_intl_abort_pd, /* I-FORWARD-TSN process functions */ .generate_ftsn = sctp_generate_iftsn, .validate_ftsn = sctp_validate_iftsn, .report_ftsn = sctp_report_iftsn, .handle_ftsn = sctp_handle_iftsn, }; void sctp_stream_interleave_init(struct sctp_stream *stream) { struct sctp_association *asoc; asoc = container_of(stream, struct sctp_association, stream); stream->si = asoc->peer.intl_capable ? &sctp_stream_interleave_1 : &sctp_stream_interleave_0; } |
23 4 4 4 23 23 23 29 28 1 1 23 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 20 19 23 23 23 23 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 23 23 1 23 23 23 23 23 23 23 23 20 3 23 23 23 23 23 23 23 23 23 3 20 20 23 23 23 1 1 1 1 23 22 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 28 25 4 2 23 23 23 23 23 1 23 23 29 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 | // SPDX-License-Identifier: GPL-2.0 /* * bcachefs setup/teardown code, and some metadata io - read a superblock and * figure out what to do with it. * * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> * Copyright 2012 Google, Inc. */ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" #include "bkey_sort.h" #include "btree_cache.h" #include "btree_gc.h" #include "btree_journal_iter.h" #include "btree_key_cache.h" #include "btree_node_scan.h" #include "btree_update_interior.h" #include "btree_io.h" #include "btree_write_buffer.h" #include "buckets_waiting_for_journal.h" #include "chardev.h" #include "checksum.h" #include "clock.h" #include "compress.h" #include "debug.h" #include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" #include "errcode.h" #include "error.h" #include "fs.h" #include "fs-io.h" #include "fs-io-buffered.h" #include "fs-io-direct.h" #include "fsck.h" #include "inode.h" #include "io_read.h" #include "io_write.h" #include "journal.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "move.h" #include "migrate.h" #include "movinggc.h" #include "nocow_locking.h" #include "quota.h" #include "rebalance.h" #include "recovery.h" #include "replicas.h" #include "sb-clean.h" #include "sb-counters.h" #include "sb-errors.h" #include "sb-members.h" #include "snapshot.h" #include "subvolume.h" #include "super.h" #include "super-io.h" #include "sysfs.h" #include "thread_with_file.h" #include "trace.h" #include <linux/backing-dev.h> #include <linux/blkdev.h> #include <linux/debugfs.h> #include <linux/device.h> #include <linux/idr.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/random.h> #include <linux/sysfs.h> #include <crypto/hash.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); MODULE_DESCRIPTION("bcachefs filesystem"); MODULE_SOFTDEP("pre: crc32c"); MODULE_SOFTDEP("pre: crc64"); MODULE_SOFTDEP("pre: sha256"); MODULE_SOFTDEP("pre: chacha20"); MODULE_SOFTDEP("pre: poly1305"); MODULE_SOFTDEP("pre: xxhash"); const char * const bch2_fs_flag_strs[] = { #define x(n) #n, BCH_FS_FLAGS() #undef x NULL }; void bch2_print_str(struct bch_fs *c, const char *str) { #ifdef __KERNEL__ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); if (unlikely(stdio)) { bch2_stdio_redirect_printf(stdio, true, "%s", str); return; } #endif bch2_print_string_as_lines(KERN_ERR, str); } __printf(2, 0) static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args) { #ifdef __KERNEL__ if (unlikely(stdio)) { if (fmt[0] == KERN_SOH[0]) fmt += 2; bch2_stdio_redirect_vprintf(stdio, true, fmt, args); return; } #endif vprintk(fmt, args); } void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...) { struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio; va_list args; va_start(args, fmt); bch2_print_maybe_redirect(stdio, fmt, args); va_end(args); } void __bch2_print(struct bch_fs *c, const char *fmt, ...) { struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); va_list args; va_start(args, fmt); bch2_print_maybe_redirect(stdio, fmt, args); va_end(args); } #define KTYPE(type) \ static const struct attribute_group type ## _group = { \ .attrs = type ## _files \ }; \ \ static const struct attribute_group *type ## _groups[] = { \ &type ## _group, \ NULL \ }; \ \ static const struct kobj_type type ## _ktype = { \ .release = type ## _release, \ .sysfs_ops = &type ## _sysfs_ops, \ .default_groups = type ## _groups \ } static void bch2_fs_release(struct kobject *); static void bch2_dev_release(struct kobject *); static void bch2_fs_counters_release(struct kobject *k) { } static void bch2_fs_internal_release(struct kobject *k) { } static void bch2_fs_opts_dir_release(struct kobject *k) { } static void bch2_fs_time_stats_release(struct kobject *k) { } KTYPE(bch2_fs); KTYPE(bch2_fs_counters); KTYPE(bch2_fs_internal); KTYPE(bch2_fs_opts_dir); KTYPE(bch2_fs_time_stats); KTYPE(bch2_dev); static struct kset *bcachefs_kset; static LIST_HEAD(bch_fs_list); static DEFINE_MUTEX(bch_fs_list_lock); DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait); static void bch2_dev_unlink(struct bch_dev *); static void bch2_dev_free(struct bch_dev *); static int bch2_dev_alloc(struct bch_fs *, unsigned); static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); struct bch_fs *bch2_dev_to_fs(dev_t dev) { struct bch_fs *c; mutex_lock(&bch_fs_list_lock); rcu_read_lock(); list_for_each_entry(c, &bch_fs_list, list) for_each_member_device_rcu(c, ca, NULL) if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { closure_get(&c->cl); goto found; } c = NULL; found: rcu_read_unlock(); mutex_unlock(&bch_fs_list_lock); return c; } static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid) { struct bch_fs *c; lockdep_assert_held(&bch_fs_list_lock); list_for_each_entry(c, &bch_fs_list, list) if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid))) return c; return NULL; } struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) { struct bch_fs *c; mutex_lock(&bch_fs_list_lock); c = __bch2_uuid_to_fs(uuid); if (c) closure_get(&c->cl); mutex_unlock(&bch_fs_list_lock); return c; } /* Filesystem RO/RW: */ /* * For startup/shutdown of RW stuff, the dependencies are: * * - foreground writes depend on copygc and rebalance (to free up space) * * - copygc and rebalance depend on mark and sweep gc (they actually probably * don't because they either reserve ahead of time or don't block if * allocations fail, but allocations can require mark and sweep gc to run * because of generation number wraparound) * * - all of the above depends on the allocator threads * * - allocator depends on the journal (when it rewrites prios and gens) */ static void __bch2_fs_read_only(struct bch_fs *c) { unsigned clean_passes = 0; u64 seq = 0; bch2_fs_ec_stop(c); bch2_open_buckets_stop(c, NULL, true); bch2_rebalance_stop(c); bch2_copygc_stop(c); bch2_fs_ec_flush(c); bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", journal_cur_seq(&c->journal)); do { clean_passes++; if (bch2_btree_interior_updates_flush(c) || bch2_btree_write_buffer_flush_going_ro(c) || bch2_journal_flush_all_pins(&c->journal) || bch2_btree_flush_all_writes(c) || seq != atomic64_read(&c->journal.seq)) { seq = atomic64_read(&c->journal.seq); clean_passes = 0; } } while (clean_passes < 2); bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu", journal_cur_seq(&c->journal)); if (test_bit(JOURNAL_replay_done, &c->journal.flags) && !test_bit(BCH_FS_emergency_ro, &c->flags)) set_bit(BCH_FS_clean_shutdown, &c->flags); bch2_fs_journal_stop(&c->journal); bch_info(c, "%sshutdown complete, journal seq %llu", test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un", c->journal.seq_ondisk); /* * After stopping journal: */ for_each_member_device(c, ca) bch2_dev_allocator_remove(c, ca); } #ifndef BCH_WRITE_REF_DEBUG static void bch2_writes_disabled(struct percpu_ref *writes) { struct bch_fs *c = container_of(writes, struct bch_fs, writes); set_bit(BCH_FS_write_disable_complete, &c->flags); wake_up(&bch2_read_only_wait); } #endif void bch2_fs_read_only(struct bch_fs *c) { if (!test_bit(BCH_FS_rw, &c->flags)) { bch2_journal_reclaim_stop(&c->journal); return; } BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags)); bch_verbose(c, "going read-only"); /* * Block new foreground-end write operations from starting - any new * writes will return -EROFS: */ set_bit(BCH_FS_going_ro, &c->flags); #ifndef BCH_WRITE_REF_DEBUG percpu_ref_kill(&c->writes); #else for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) bch2_write_ref_put(c, i); #endif /* * If we're not doing an emergency shutdown, we want to wait on * outstanding writes to complete so they don't see spurious errors due * to shutting down the allocator: * * If we are doing an emergency shutdown outstanding writes may * hang until we shutdown the allocator so we don't want to wait * on outstanding writes before shutting everything down - but * we do need to wait on them before returning and signalling * that going RO is complete: */ wait_event(bch2_read_only_wait, test_bit(BCH_FS_write_disable_complete, &c->flags) || test_bit(BCH_FS_emergency_ro, &c->flags)); bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags); if (writes_disabled) bch_verbose(c, "finished waiting for writes to stop"); __bch2_fs_read_only(c); wait_event(bch2_read_only_wait, test_bit(BCH_FS_write_disable_complete, &c->flags)); if (!writes_disabled) bch_verbose(c, "finished waiting for writes to stop"); clear_bit(BCH_FS_write_disable_complete, &c->flags); clear_bit(BCH_FS_going_ro, &c->flags); clear_bit(BCH_FS_rw, &c->flags); if (!bch2_journal_error(&c->journal) && !test_bit(BCH_FS_error, &c->flags) && !test_bit(BCH_FS_emergency_ro, &c->flags) && test_bit(BCH_FS_started, &c->flags) && test_bit(BCH_FS_clean_shutdown, &c->flags) && c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) { BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); BUG_ON(c->btree_write_buffer.inc.keys.nr); BUG_ON(c->btree_write_buffer.flushing.keys.nr); bch2_verify_accounting_clean(c); bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); } else { bch_verbose(c, "done going read-only, filesystem not clean"); } } static void bch2_fs_read_only_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, read_only_work); down_write(&c->state_lock); bch2_fs_read_only(c); up_write(&c->state_lock); } static void bch2_fs_read_only_async(struct bch_fs *c) { queue_work(system_long_wq, &c->read_only_work); } bool bch2_fs_emergency_read_only(struct bch_fs *c) { bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); bch2_journal_halt(&c->journal); bch2_fs_read_only_async(c); wake_up(&bch2_read_only_wait); return ret; } static int bch2_fs_read_write_late(struct bch_fs *c) { int ret; /* * Data move operations can't run until after check_snapshots has * completed, and bch2_snapshot_is_ancestor() is available. * * Ideally we'd start copygc/rebalance earlier instead of waiting for * all of recovery/fsck to complete: */ ret = bch2_copygc_start(c); if (ret) { bch_err(c, "error starting copygc thread"); return ret; } ret = bch2_rebalance_start(c); if (ret) { bch_err(c, "error starting rebalance thread"); return ret; } return 0; } static int __bch2_fs_read_write(struct bch_fs *c, bool early) { int ret; if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { bch_err(c, "cannot go rw, unfixed btree errors"); return -BCH_ERR_erofs_unfixed_errors; } if (test_bit(BCH_FS_rw, &c->flags)) return 0; bch_info(c, "going read-write"); ret = bch2_sb_members_v2_init(c); if (ret) goto err; ret = bch2_fs_mark_dirty(c); if (ret) goto err; clear_bit(BCH_FS_clean_shutdown, &c->flags); /* * First journal write must be a flush write: after a clean shutdown we * don't read the journal, so the first journal write may end up * overwriting whatever was there previously, and there must always be * at least one non-flush write in the journal or recovery will fail: */ set_bit(JOURNAL_need_flush_write, &c->journal.flags); set_bit(JOURNAL_running, &c->journal.flags); for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); set_bit(BCH_FS_rw, &c->flags); set_bit(BCH_FS_was_rw, &c->flags); #ifndef BCH_WRITE_REF_DEBUG percpu_ref_reinit(&c->writes); #else for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { BUG_ON(atomic_long_read(&c->writes[i])); atomic_long_inc(&c->writes[i]); } #endif ret = bch2_journal_reclaim_start(&c->journal); if (ret) goto err; if (!early) { ret = bch2_fs_read_write_late(c); if (ret) goto err; } bch2_do_discards(c); bch2_do_invalidates(c); bch2_do_stripe_deletes(c); bch2_do_pending_node_rewrites(c); return 0; err: if (test_bit(BCH_FS_rw, &c->flags)) bch2_fs_read_only(c); else __bch2_fs_read_only(c); return ret; } int bch2_fs_read_write(struct bch_fs *c) { if (c->opts.recovery_pass_last && c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay) return -BCH_ERR_erofs_norecovery; if (c->opts.nochanges) return -BCH_ERR_erofs_nochanges; return __bch2_fs_read_write(c, false); } int bch2_fs_read_write_early(struct bch_fs *c) { lockdep_assert_held(&c->state_lock); return __bch2_fs_read_write(c, true); } /* Filesystem startup/shutdown: */ static void __bch2_fs_free(struct bch_fs *c) { for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); bch2_fs_accounting_exit(c); bch2_fs_sb_errors_exit(c); bch2_fs_counters_exit(c); bch2_fs_snapshots_exit(c); bch2_fs_quota_exit(c); bch2_fs_fs_io_direct_exit(c); bch2_fs_fs_io_buffered_exit(c); bch2_fs_fsio_exit(c); bch2_fs_vfs_exit(c); bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); bch2_fs_nocow_locking_exit(c); bch2_fs_io_write_exit(c); bch2_fs_io_read_exit(c); bch2_fs_buckets_waiting_for_journal_exit(c); bch2_fs_btree_interior_update_exit(c); bch2_fs_btree_key_cache_exit(&c->btree_key_cache); bch2_fs_btree_cache_exit(c); bch2_fs_btree_iter_exit(c); bch2_fs_replicas_exit(c); bch2_fs_journal_exit(&c->journal); bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); bch2_journal_keys_put_initial(c); bch2_find_btree_nodes_exit(&c->found_btree_nodes); BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); if (c->online_reserved) { u64 v = percpu_u64_get(c->online_reserved); WARN(v, "online_reserved not 0 at shutdown: %lli", v); free_percpu(c->online_reserved); } darray_exit(&c->btree_roots_extra); free_percpu(c->pcpu); free_percpu(c->usage); mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); mempool_exit(&c->fill_iter); #ifndef BCH_WRITE_REF_DEBUG percpu_ref_exit(&c->writes); #endif kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); kfree(c->unused_inode_hints); if (c->write_ref_wq) destroy_workqueue(c->write_ref_wq); if (c->btree_write_submit_wq) destroy_workqueue(c->btree_write_submit_wq); if (c->btree_read_complete_wq) destroy_workqueue(c->btree_read_complete_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); if (c->btree_io_complete_wq) destroy_workqueue(c->btree_io_complete_wq); if (c->btree_update_wq) destroy_workqueue(c->btree_update_wq); bch2_free_super(&c->disk_sb); kvfree(c); module_put(THIS_MODULE); } static void bch2_fs_release(struct kobject *kobj) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); __bch2_fs_free(c); } void __bch2_fs_stop(struct bch_fs *c) { bch_verbose(c, "shutting down"); set_bit(BCH_FS_stopping, &c->flags); down_write(&c->state_lock); bch2_fs_read_only(c); up_write(&c->state_lock); for_each_member_device(c, ca) bch2_dev_unlink(ca); if (c->kobj.state_in_sysfs) kobject_del(&c->kobj); bch2_fs_debug_exit(c); bch2_fs_chardev_exit(c); bch2_ro_ref_put(c); wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref)); kobject_put(&c->counters_kobj); kobject_put(&c->time_stats); kobject_put(&c->opts_dir); kobject_put(&c->internal); /* btree prefetch might have kicked off reads in the background: */ bch2_btree_flush_all_reads(c); for_each_member_device(c, ca) cancel_work_sync(&ca->io_error_work); cancel_work_sync(&c->read_only_work); } void bch2_fs_free(struct bch_fs *c) { unsigned i; mutex_lock(&bch_fs_list_lock); list_del(&c->list); mutex_unlock(&bch_fs_list_lock); closure_sync(&c->cl); closure_debug_destroy(&c->cl); for (i = 0; i < c->sb.nr_devices; i++) { struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); if (ca) { EBUG_ON(atomic_long_read(&ca->ref) != 1); bch2_free_super(&ca->disk_sb); bch2_dev_free(ca); } } bch_verbose(c, "shutdown complete"); kobject_put(&c->kobj); } void bch2_fs_stop(struct bch_fs *c) { __bch2_fs_stop(c); bch2_fs_free(c); } static int bch2_fs_online(struct bch_fs *c) { int ret = 0; lockdep_assert_held(&bch_fs_list_lock); if (__bch2_uuid_to_fs(c->sb.uuid)) { bch_err(c, "filesystem UUID already open"); return -EINVAL; } ret = bch2_fs_chardev_init(c); if (ret) { bch_err(c, "error creating character device"); return ret; } bch2_fs_debug_init(c); ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: kobject_add(&c->internal, &c->kobj, "internal") ?: kobject_add(&c->opts_dir, &c->kobj, "options") ?: #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: #endif kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: bch2_opts_create_sysfs_files(&c->opts_dir); if (ret) { bch_err(c, "error creating sysfs objects"); return ret; } down_write(&c->state_lock); for_each_member_device(c, ca) { ret = bch2_dev_sysfs_online(c, ca); if (ret) { bch_err(c, "error creating sysfs objects"); bch2_dev_put(ca); goto err; } } BUG_ON(!list_empty(&c->list)); list_add(&c->list, &bch_fs_list); err: up_write(&c->state_lock); return ret; } static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) { struct bch_fs *c; struct printbuf name = PRINTBUF; unsigned i, iter_size; int ret = 0; c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); if (!c) { c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); goto out; } c->stdio = (void *)(unsigned long) opts.stdio; __module_get(THIS_MODULE); closure_init(&c->cl, NULL); c->kobj.kset = bcachefs_kset; kobject_init(&c->kobj, &bch2_fs_ktype); kobject_init(&c->internal, &bch2_fs_internal_ktype); kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype); c->minor = -1; c->disk_sb.fs_sb = true; init_rwsem(&c->state_lock); mutex_init(&c->sb_lock); mutex_init(&c->replicas_gc_lock); mutex_init(&c->btree_root_lock); INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); refcount_set(&c->ro_ref, 1); init_waitqueue_head(&c->ro_ref_wait); sema_init(&c->online_fsck_mutex, 1); init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); atomic_set(&c->journal_keys.ref, 1); c->journal_keys.initial_ref_held = true; for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); bch2_fs_gc_init(c); bch2_fs_copygc_init(c); bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); bch2_fs_btree_iter_init_early(c); bch2_fs_btree_interior_update_init_early(c); bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); bch2_fs_rebalance_init(c); bch2_fs_quota_init(c); bch2_fs_ec_init_early(c); bch2_fs_move_init(c); bch2_fs_sb_errors_init_early(c); INIT_LIST_HEAD(&c->list); mutex_init(&c->bio_bounce_pages_lock); mutex_init(&c->snapshot_table_lock); init_rwsem(&c->snapshot_create_lock); spin_lock_init(&c->btree_write_error_lock); INIT_LIST_HEAD(&c->journal_iters); INIT_LIST_HEAD(&c->fsck_error_msgs); mutex_init(&c->fsck_error_msgs_lock); seqcount_init(&c->usage_lock); sema_init(&c->io_in_flight, 128); INIT_LIST_HEAD(&c->vfs_inodes_list); mutex_init(&c->vfs_inodes_lock); c->copy_gc_enabled = 1; c->rebalance.enabled = 1; c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; bch2_fs_btree_cache_init_early(&c->btree_cache); mutex_init(&c->sectors_available_lock); ret = percpu_init_rwsem(&c->mark_lock); if (ret) goto err; mutex_lock(&c->sb_lock); ret = bch2_sb_to_fs(c, sb); mutex_unlock(&c->sb_lock); if (ret) goto err; pr_uuid(&name, c->sb.user_uuid.b); ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; if (ret) goto err; strscpy(c->name, name.buf, sizeof(c->name)); printbuf_exit(&name); /* Compat: */ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && !BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); c->opts = bch2_opts_default; ret = bch2_opts_from_sb(&c->opts, sb); if (ret) goto err; bch2_opts_apply(&c->opts, opts); c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; if (c->opts.inodes_use_key_cache) c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops; c->block_bits = ilog2(block_sectors(c)); c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); if (bch2_fs_init_fault("fs_alloc")) { bch_err(c, "fs_alloc fault injected"); ret = -EFAULT; goto err; } iter_size = sizeof(struct sort_iter) + (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); if (!(c->btree_update_wq = alloc_workqueue("bcachefs", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", WQ_FREEZABLE, 0)) || #ifndef BCH_WRITE_REF_DEBUG percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || #endif mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || bioset_init(&c->btree_bio, 1, max(offsetof(struct btree_read_bio, bio), offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || !(c->usage = alloc_percpu(struct bch_fs_usage_base)) || !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, c->opts.btree_node_size) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, sizeof(u64), GFP_KERNEL))) { ret = -BCH_ERR_ENOMEM_fs_other_alloc; goto err; } ret = bch2_fs_counters_init(c) ?: bch2_fs_sb_errors_init(c) ?: bch2_io_clock_init(&c->io_clock[READ]) ?: bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_fs_journal_init(&c->journal) ?: bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_cache_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: bch2_fs_btree_interior_update_init(c) ?: bch2_fs_buckets_waiting_for_journal_init(c) ?: bch2_fs_btree_write_buffer_init(c) ?: bch2_fs_subvolumes_init(c) ?: bch2_fs_io_read_init(c) ?: bch2_fs_io_write_init(c) ?: bch2_fs_nocow_locking_init(c) ?: bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: bch2_fs_ec_init(c) ?: bch2_fs_vfs_init(c) ?: bch2_fs_fsio_init(c) ?: bch2_fs_fs_io_buffered_init(c) ?: bch2_fs_fs_io_direct_init(c); if (ret) goto err; for (i = 0; i < c->sb.nr_devices; i++) { if (!bch2_member_exists(c->disk_sb.sb, i)) continue; ret = bch2_dev_alloc(c, i); if (ret) goto err; } bch2_journal_entry_res_resize(&c->journal, &c->btree_root_journal_res, BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); bch2_journal_entry_res_resize(&c->journal, &c->clock_journal_res, (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); mutex_lock(&bch_fs_list_lock); ret = bch2_fs_online(c); mutex_unlock(&bch_fs_list_lock); if (ret) goto err; out: return c; err: bch2_fs_free(c); c = ERR_PTR(ret); goto out; } noinline_for_stack static void print_mount_opts(struct bch_fs *c) { enum bch_opt_id i; struct printbuf p = PRINTBUF; bool first = true; prt_str(&p, "starting version "); bch2_version_to_text(&p, c->sb.version); if (c->opts.read_only) { prt_str(&p, " opts="); first = false; prt_printf(&p, "ro"); } for (i = 0; i < bch2_opts_nr; i++) { const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); if (!(opt->flags & OPT_MOUNT)) continue; if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) continue; prt_str(&p, first ? " opts=" : ","); first = false; bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); } bch_info(c, "%s", p.buf); printbuf_exit(&p); } int bch2_fs_start(struct bch_fs *c) { time64_t now = ktime_get_real_seconds(); int ret; print_mount_opts(c); down_write(&c->state_lock); BUG_ON(test_bit(BCH_FS_started, &c->flags)); mutex_lock(&c->sb_lock); ret = bch2_sb_members_v2_init(c); if (ret) { mutex_unlock(&c->sb_lock); goto err; } for_each_online_member(c, ca) bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); struct bch_sb_field_ext *ext = bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64)); mutex_unlock(&c->sb_lock); if (!ext) { bch_err(c, "insufficient space in superblock for sb_field_ext"); ret = -BCH_ERR_ENOSPC_sb; goto err; } for_each_rw_member(c, ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); ret = BCH_SB_INITIALIZED(c->disk_sb.sb) ? bch2_fs_recovery(c) : bch2_fs_initialize(c); if (ret) goto err; ret = bch2_opts_check_may_set(c); if (ret) goto err; if (bch2_fs_init_fault("fs_start")) { bch_err(c, "fs_start fault injected"); ret = -EINVAL; goto err; } set_bit(BCH_FS_started, &c->flags); if (c->opts.read_only) { bch2_fs_read_only(c); } else { ret = !test_bit(BCH_FS_rw, &c->flags) ? bch2_fs_read_write(c) : bch2_fs_read_write_late(c); if (ret) goto err; } ret = 0; err: if (ret) bch_err_msg(c, ret, "starting filesystem"); else bch_verbose(c, "done starting filesystem"); up_write(&c->state_lock); return ret; } static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) { struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); if (le16_to_cpu(sb->block_size) != block_sectors(c)) return -BCH_ERR_mismatched_block_size; if (le16_to_cpu(m.bucket_size) < BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) return -BCH_ERR_bucket_size_too_small; return 0; } static int bch2_dev_in_fs(struct bch_sb_handle *fs, struct bch_sb_handle *sb, struct bch_opts *opts) { if (fs == sb) return 0; if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid)) return -BCH_ERR_device_not_a_member_of_filesystem; if (!bch2_member_exists(fs->sb, sb->sb->dev_idx)) return -BCH_ERR_device_has_been_removed; if (fs->sb->block_size != sb->sb->block_size) return -BCH_ERR_mismatched_block_size; if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq || le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq) return 0; if (fs->sb->seq == sb->sb->seq && fs->sb->write_time != sb->sb->write_time) { struct printbuf buf = PRINTBUF; prt_str(&buf, "Split brain detected between "); prt_bdevname(&buf, sb->bdev); prt_str(&buf, " and "); prt_bdevname(&buf, fs->bdev); prt_char(&buf, ':'); prt_newline(&buf); prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq)); prt_newline(&buf); prt_bdevname(&buf, fs->bdev); prt_char(&buf, ' '); bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));; prt_newline(&buf); prt_bdevname(&buf, sb->bdev); prt_char(&buf, ' '); bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));; prt_newline(&buf); if (!opts->no_splitbrain_check) prt_printf(&buf, "Not using older sb"); pr_err("%s", buf.buf); printbuf_exit(&buf); if (!opts->no_splitbrain_check) return -BCH_ERR_device_splitbrain; } struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx); u64 seq_from_fs = le64_to_cpu(m.seq); u64 seq_from_member = le64_to_cpu(sb->sb->seq); if (seq_from_fs && seq_from_fs < seq_from_member) { struct printbuf buf = PRINTBUF; prt_str(&buf, "Split brain detected between "); prt_bdevname(&buf, sb->bdev); prt_str(&buf, " and "); prt_bdevname(&buf, fs->bdev); prt_char(&buf, ':'); prt_newline(&buf); prt_bdevname(&buf, fs->bdev); prt_str(&buf, " believes seq of "); prt_bdevname(&buf, sb->bdev); prt_printf(&buf, " to be %llu, but ", seq_from_fs); prt_bdevname(&buf, sb->bdev); prt_printf(&buf, " has %llu\n", seq_from_member); if (!opts->no_splitbrain_check) { prt_str(&buf, "Not using "); prt_bdevname(&buf, sb->bdev); } pr_err("%s", buf.buf); printbuf_exit(&buf); if (!opts->no_splitbrain_check) return -BCH_ERR_device_splitbrain; } return 0; } /* Device startup/shutdown: */ static void bch2_dev_release(struct kobject *kobj) { struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); kfree(ca); } static void bch2_dev_free(struct bch_dev *ca) { cancel_work_sync(&ca->io_error_work); bch2_dev_unlink(ca); if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); bch2_free_super(&ca->disk_sb); bch2_dev_allocator_background_exit(ca); bch2_dev_journal_exit(ca); free_percpu(ca->io_done); bch2_dev_buckets_free(ca); free_page((unsigned long) ca->sb_read_scratch); bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); percpu_ref_exit(&ca->io_ref); #ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_exit(&ca->ref); #endif kobject_put(&ca->kobj); } static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); if (percpu_ref_is_zero(&ca->io_ref)) return; __bch2_dev_read_only(c, ca); reinit_completion(&ca->io_ref_completion); percpu_ref_kill(&ca->io_ref); wait_for_completion(&ca->io_ref_completion); bch2_dev_unlink(ca); bch2_free_super(&ca->disk_sb); bch2_dev_journal_exit(ca); } #ifndef CONFIG_BCACHEFS_DEBUG static void bch2_dev_ref_complete(struct percpu_ref *ref) { struct bch_dev *ca = container_of(ref, struct bch_dev, ref); complete(&ca->ref_completion); } #endif static void bch2_dev_io_ref_complete(struct percpu_ref *ref) { struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); complete(&ca->io_ref_completion); } static void bch2_dev_unlink(struct bch_dev *ca) { struct kobject *b; /* * This is racy w.r.t. the underlying block device being hot-removed, * which removes it from sysfs. * * It'd be lovely if we had a way to handle this race, but the sysfs * code doesn't appear to provide a good method and block/holder.c is * susceptible as well: */ if (ca->kobj.state_in_sysfs && ca->disk_sb.bdev && (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) { sysfs_remove_link(b, "bcachefs"); sysfs_remove_link(&ca->kobj, "block"); } } static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) { int ret; if (!c->kobj.state_in_sysfs) return 0; if (!ca->kobj.state_in_sysfs) { ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx); if (ret) return ret; } if (ca->disk_sb.bdev) { struct kobject *block = bdev_kobj(ca->disk_sb.bdev); ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); if (ret) return ret; ret = sysfs_create_link(&ca->kobj, block, "block"); if (ret) return ret; } return 0; } static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, struct bch_member *member) { struct bch_dev *ca; unsigned i; ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) return NULL; kobject_init(&ca->kobj, &bch2_dev_ktype); init_completion(&ca->ref_completion); init_completion(&ca->io_ref_completion); init_rwsem(&ca->bucket_lock); INIT_WORK(&ca->io_error_work, bch2_io_error_work); bch2_time_stats_quantiles_init(&ca->io_latency[READ]); bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]); ca->mi = bch2_mi_to_cpu(member); for (i = 0; i < ARRAY_SIZE(member->errors); i++) atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i])); ca->uuid = member->uuid; ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ca->mi.bucket_size / btree_sectors(c)); #ifndef CONFIG_BCACHEFS_DEBUG if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL)) goto err; #else atomic_long_set(&ca->ref, 1); #endif bch2_dev_allocator_background_init(ca); if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || !(ca->io_done = alloc_percpu(*ca->io_done))) goto err; return ca; err: bch2_dev_free(ca); return NULL; } static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx) { ca->dev_idx = dev_idx; __set_bit(ca->dev_idx, ca->self.d); scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); ca->fs = c; rcu_assign_pointer(c->devs[ca->dev_idx], ca); if (bch2_dev_sysfs_online(c, ca)) pr_warn("error creating sysfs objects"); } static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) { struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx); struct bch_dev *ca = NULL; int ret = 0; if (bch2_fs_init_fault("dev_alloc")) goto err; ca = __bch2_dev_alloc(c, &member); if (!ca) goto err; ca->fs = c; bch2_dev_attach(c, ca, dev_idx); return ret; err: if (ca) bch2_dev_free(ca); return -BCH_ERR_ENOMEM_dev_alloc; } static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) { unsigned ret; if (bch2_dev_is_online(ca)) { bch_err(ca, "already have device online in slot %u", sb->sb->dev_idx); return -BCH_ERR_device_already_online; } if (get_capacity(sb->bdev->bd_disk) < ca->mi.bucket_size * ca->mi.nbuckets) { bch_err(ca, "cannot online: device too small"); return -BCH_ERR_device_size_too_small; } BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); ret = bch2_dev_journal_init(ca, sb->sb); if (ret) return ret; /* Commit: */ ca->disk_sb = *sb; memset(sb, 0, sizeof(*sb)); ca->dev = ca->disk_sb.bdev->bd_dev; percpu_ref_reinit(&ca->io_ref); return 0; } static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) { struct bch_dev *ca; int ret; lockdep_assert_held(&c->state_lock); if (le64_to_cpu(sb->sb->seq) > le64_to_cpu(c->disk_sb.sb->seq)) bch2_sb_to_fs(c, sb->sb); BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx)); ca = bch2_dev_locked(c, sb->sb->dev_idx); ret = __bch2_dev_attach_bdev(ca, sb); if (ret) return ret; bch2_dev_sysfs_online(c, ca); struct printbuf name = PRINTBUF; prt_bdevname(&name, ca->disk_sb.bdev); if (c->sb.nr_devices == 1) strscpy(c->name, name.buf, sizeof(c->name)); strscpy(ca->name, name.buf, sizeof(ca->name)); printbuf_exit(&name); rebalance_wakeup(c); return 0; } /* Device management: */ /* * Note: this function is also used by the error paths - when a particular * device sees an error, we call it to determine whether we can just set the * device RO, or - if this function returns false - we'll set the whole * filesystem RO: * * XXX: maybe we should be more explicit about whether we're changing state * because we got an error or what have you? */ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { struct bch_devs_mask new_online_devs; int nr_rw = 0, required; lockdep_assert_held(&c->state_lock); switch (new_state) { case BCH_MEMBER_STATE_rw: return true; case BCH_MEMBER_STATE_ro: if (ca->mi.state != BCH_MEMBER_STATE_rw) return true; /* do we have enough devices to write to? */ for_each_member_device(c, ca2) if (ca2 != ca) nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw; required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) ? c->opts.metadata_replicas : metadata_replicas_required(c), !(flags & BCH_FORCE_IF_DATA_DEGRADED) ? c->opts.data_replicas : data_replicas_required(c)); return nr_rw >= required; case BCH_MEMBER_STATE_failed: case BCH_MEMBER_STATE_spare: if (ca->mi.state != BCH_MEMBER_STATE_rw && ca->mi.state != BCH_MEMBER_STATE_ro) return true; /* do we have enough devices to read from? */ new_online_devs = bch2_online_devs(c); __clear_bit(ca->dev_idx, new_online_devs.d); return bch2_have_enough_devs(c, new_online_devs, flags, false); default: BUG(); } } static bool bch2_fs_may_start(struct bch_fs *c) { struct bch_dev *ca; unsigned i, flags = 0; if (c->opts.very_degraded) flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; if (c->opts.degraded) flags |= BCH_FORCE_IF_DEGRADED; if (!c->opts.degraded && !c->opts.very_degraded) { mutex_lock(&c->sb_lock); for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { if (!bch2_member_exists(c->disk_sb.sb, i)) continue; ca = bch2_dev_locked(c, i); if (!bch2_dev_is_online(ca) && (ca->mi.state == BCH_MEMBER_STATE_rw || ca->mi.state == BCH_MEMBER_STATE_ro)) { mutex_unlock(&c->sb_lock); return false; } } mutex_unlock(&c->sb_lock); } return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); } static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) { /* * The allocator thread itself allocates btree nodes, so stop it first: */ bch2_dev_allocator_remove(c, ca); bch2_recalc_capacity(c); bch2_dev_journal_stop(&c->journal, ca); } static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw); bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); bch2_dev_do_discards(ca); } int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { struct bch_member *m; int ret = 0; if (ca->mi.state == new_state) return 0; if (!bch2_dev_state_allowed(c, ca, new_state, flags)) return -BCH_ERR_device_state_not_allowed; if (new_state != BCH_MEMBER_STATE_rw) __bch2_dev_read_only(c, ca); bch_notice(ca, "%s", bch2_member_states[new_state]); mutex_lock(&c->sb_lock); m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); SET_BCH_MEMBER_STATE(m, new_state); bch2_write_super(c); mutex_unlock(&c->sb_lock); if (new_state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); rebalance_wakeup(c); return ret; } int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { int ret; down_write(&c->state_lock); ret = __bch2_dev_set_state(c, ca, new_state, flags); up_write(&c->state_lock); return ret; } /* Device add/removal: */ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) { struct bch_member *m; unsigned dev_idx = ca->dev_idx, data; int ret; down_write(&c->state_lock); /* * We consume a reference to ca->ref, regardless of whether we succeed * or fail: */ bch2_dev_put(ca); if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot remove without losing data"); ret = -BCH_ERR_device_state_not_allowed; goto err; } __bch2_dev_read_only(c, ca); ret = bch2_dev_data_drop(c, ca->dev_idx, flags); bch_err_msg(ca, ret, "bch2_dev_data_drop()"); if (ret) goto err; ret = bch2_dev_remove_alloc(c, ca); bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); if (ret) goto err; /* * We need to flush the entire journal to get rid of keys that reference * the device being removed before removing the superblock entry */ bch2_journal_flush_all_pins(&c->journal); /* * this is really just needed for the bch2_replicas_gc_(start|end) * calls, and could be cleaned up: */ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()"); if (ret) goto err; ret = bch2_journal_flush(&c->journal); bch_err_msg(ca, ret, "bch2_journal_flush()"); if (ret) goto err; ret = bch2_replicas_gc2(c); bch_err_msg(ca, ret, "bch2_replicas_gc2()"); if (ret) goto err; data = bch2_dev_has_data(c, ca); if (data) { struct printbuf data_has = PRINTBUF; prt_bitflags(&data_has, __bch2_data_types, data); bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); printbuf_exit(&data_has); ret = -EBUSY; goto err; } __bch2_dev_offline(c, ca); mutex_lock(&c->sb_lock); rcu_assign_pointer(c->devs[ca->dev_idx], NULL); mutex_unlock(&c->sb_lock); #ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_kill(&ca->ref); #else ca->dying = true; bch2_dev_put(ca); #endif wait_for_completion(&ca->ref_completion); bch2_dev_free(ca); /* * Free this device's slot in the bch_member array - all pointers to * this device must be gone: */ mutex_lock(&c->sb_lock); m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); memset(&m->uuid, 0, sizeof(m->uuid)); bch2_write_super(c); mutex_unlock(&c->sb_lock); up_write(&c->state_lock); return 0; err: if (ca->mi.state == BCH_MEMBER_STATE_rw && !percpu_ref_is_zero(&ca->io_ref)) __bch2_dev_read_write(c, ca); up_write(&c->state_lock); return ret; } /* Add new device to running filesystem: */ int bch2_dev_add(struct bch_fs *c, const char *path) { struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb; struct bch_dev *ca = NULL; struct printbuf errbuf = PRINTBUF; struct printbuf label = PRINTBUF; int ret; ret = bch2_read_super(path, &opts, &sb); bch_err_msg(c, ret, "reading super"); if (ret) goto err; struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx); if (BCH_MEMBER_GROUP(&dev_mi)) { bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); if (label.allocation_failure) { ret = -ENOMEM; goto err; } } ret = bch2_dev_may_add(sb.sb, c); if (ret) goto err; ca = __bch2_dev_alloc(c, &dev_mi); if (!ca) { ret = -ENOMEM; goto err; } ret = __bch2_dev_attach_bdev(ca, &sb); if (ret) goto err; ret = bch2_dev_journal_alloc(ca, true); bch_err_msg(c, ret, "allocating journal"); if (ret) goto err; down_write(&c->state_lock); mutex_lock(&c->sb_lock); ret = bch2_sb_from_fs(c, ca); bch_err_msg(c, ret, "setting up new superblock"); if (ret) goto err_unlock; if (dynamic_fault("bcachefs:add:no_slot")) goto err_unlock; ret = bch2_sb_member_alloc(c); if (ret < 0) { bch_err_msg(c, ret, "setting up new superblock"); goto err_unlock; } unsigned dev_idx = ret; /* success: */ dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds()); *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi; ca->disk_sb.sb->dev_idx = dev_idx; bch2_dev_attach(c, ca, dev_idx); if (BCH_MEMBER_GROUP(&dev_mi)) { ret = __bch2_dev_group_set(c, ca, label.buf); bch_err_msg(c, ret, "creating new label"); if (ret) goto err_unlock; } bch2_write_super(c); mutex_unlock(&c->sb_lock); ret = bch2_dev_usage_init(ca, false); if (ret) goto err_late; ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); bch_err_msg(ca, ret, "marking new superblock"); if (ret) goto err_late; ret = bch2_fs_freespace_init(c); bch_err_msg(ca, ret, "initializing free space"); if (ret) goto err_late; ca->new_fs_bucket_idx = 0; if (ca->mi.state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); up_write(&c->state_lock); return 0; err_unlock: mutex_unlock(&c->sb_lock); up_write(&c->state_lock); err: if (ca) bch2_dev_free(ca); bch2_free_super(&sb); printbuf_exit(&label); printbuf_exit(&errbuf); bch_err_fn(c, ret); return ret; err_late: up_write(&c->state_lock); ca = NULL; goto err; } /* Hot add existing device to running filesystem: */ int bch2_dev_online(struct bch_fs *c, const char *path) { struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb = { NULL }; struct bch_dev *ca; unsigned dev_idx; int ret; down_write(&c->state_lock); ret = bch2_read_super(path, &opts, &sb); if (ret) { up_write(&c->state_lock); return ret; } dev_idx = sb.sb->dev_idx; ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts); bch_err_msg(c, ret, "bringing %s online", path); if (ret) goto err; ret = bch2_dev_attach_bdev(c, &sb); if (ret) goto err; ca = bch2_dev_locked(c, dev_idx); ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path); if (ret) goto err; if (ca->mi.state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); if (!ca->mi.freespace_initialized) { ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); bch_err_msg(ca, ret, "initializing free space"); if (ret) goto err; } if (!ca->journal.nr) { ret = bch2_dev_journal_alloc(ca, false); bch_err_msg(ca, ret, "allocating journal"); if (ret) goto err; } mutex_lock(&c->sb_lock); bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(ktime_get_real_seconds()); bch2_write_super(c); mutex_unlock(&c->sb_lock); up_write(&c->state_lock); return 0; err: up_write(&c->state_lock); bch2_free_super(&sb); return ret; } int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) { down_write(&c->state_lock); if (!bch2_dev_is_online(ca)) { bch_err(ca, "Already offline"); up_write(&c->state_lock); return 0; } if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot offline required disk"); up_write(&c->state_lock); return -BCH_ERR_device_state_not_allowed; } __bch2_dev_offline(c, ca); up_write(&c->state_lock); return 0; } int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bch_member *m; u64 old_nbuckets; int ret = 0; down_write(&c->state_lock); old_nbuckets = ca->mi.nbuckets; if (nbuckets < ca->mi.nbuckets) { bch_err(ca, "Cannot shrink yet"); ret = -EINVAL; goto err; } if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) { bch_err(ca, "New device size too big (%llu greater than max %u)", nbuckets, BCH_MEMBER_NBUCKETS_MAX); ret = -BCH_ERR_device_size_too_big; goto err; } if (bch2_dev_is_online(ca) && get_capacity(ca->disk_sb.bdev->bd_disk) < ca->mi.bucket_size * nbuckets) { bch_err(ca, "New size larger than device"); ret = -BCH_ERR_device_size_too_small; goto err; } ret = bch2_dev_buckets_resize(c, ca, nbuckets); bch_err_msg(ca, ret, "resizing buckets"); if (ret) goto err; ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); if (ret) goto err; mutex_lock(&c->sb_lock); m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); m->nbuckets = cpu_to_le64(nbuckets); bch2_write_super(c); mutex_unlock(&c->sb_lock); if (ca->mi.freespace_initialized) { struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_dev_data_type, .dev_data_type.dev = ca->dev_idx, .dev_data_type.data_type = BCH_DATA_free, }; u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?: bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); if (ret) goto err; } bch2_recalc_capacity(c); err: up_write(&c->state_lock); return ret; } /* return with ref on ca->ref: */ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) { if (!strncmp(name, "/dev/", strlen("/dev/"))) name += strlen("/dev/"); for_each_member_device(c, ca) if (!strcmp(name, ca->name)) return ca; return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); } /* Filesystem open: */ static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) { return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?: cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time)); } struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_opts opts) { DARRAY(struct bch_sb_handle) sbs = { 0 }; struct bch_fs *c = NULL; struct bch_sb_handle *best = NULL; struct printbuf errbuf = PRINTBUF; int ret = 0; if (!try_module_get(THIS_MODULE)) return ERR_PTR(-ENODEV); if (!nr_devices) { ret = -EINVAL; goto err; } ret = darray_make_room(&sbs, nr_devices); if (ret) goto err; for (unsigned i = 0; i < nr_devices; i++) { struct bch_sb_handle sb = { NULL }; ret = bch2_read_super(devices[i], &opts, &sb); if (ret) goto err; BUG_ON(darray_push(&sbs, sb)); } if (opts.nochanges && !opts.read_only) { ret = -BCH_ERR_erofs_nochanges; goto err_print; } darray_for_each(sbs, sb) if (!best || sb_cmp(sb->sb, best->sb) > 0) best = sb; darray_for_each_reverse(sbs, sb) { ret = bch2_dev_in_fs(best, sb, &opts); if (ret == -BCH_ERR_device_has_been_removed || ret == -BCH_ERR_device_splitbrain) { bch2_free_super(sb); darray_remove_item(&sbs, sb); best -= best > sb; ret = 0; continue; } if (ret) goto err_print; } c = bch2_fs_alloc(best->sb, opts); ret = PTR_ERR_OR_ZERO(c); if (ret) goto err; down_write(&c->state_lock); darray_for_each(sbs, sb) { ret = bch2_dev_attach_bdev(c, sb); if (ret) { up_write(&c->state_lock); goto err; } } up_write(&c->state_lock); if (!bch2_fs_may_start(c)) { ret = -BCH_ERR_insufficient_devices_to_start; goto err_print; } if (!c->opts.nostart) { ret = bch2_fs_start(c); if (ret) goto err; } out: darray_for_each(sbs, sb) bch2_free_super(sb); darray_exit(&sbs); printbuf_exit(&errbuf); module_put(THIS_MODULE); return c; err_print: pr_err("bch_fs_open err opening %s: %s", devices[0], bch2_err_str(ret)); err: if (!IS_ERR_OR_NULL(c)) bch2_fs_stop(c); c = ERR_PTR(ret); goto out; } /* Global interfaces/init */ static void bcachefs_exit(void) { bch2_debug_exit(); bch2_vfs_exit(); bch2_chardev_exit(); bch2_btree_key_cache_exit(); if (bcachefs_kset) kset_unregister(bcachefs_kset); } static int __init bcachefs_init(void) { bch2_bkey_pack_test(); if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || bch2_btree_key_cache_init() || bch2_chardev_init() || bch2_vfs_init() || bch2_debug_init()) goto err; return 0; err: bcachefs_exit(); return -ENOMEM; } #define BCH_DEBUG_PARAM(name, description) \ bool bch2_##name; \ module_param_named(name, bch2_##name, bool, 0644); \ MODULE_PARM_DESC(name, description); BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM __maybe_unused static unsigned bch2_metadata_version = bcachefs_metadata_version_current; module_param_named(version, bch2_metadata_version, uint, 0400); module_exit(bcachefs_exit); module_init(bcachefs_init); |
110 110 110 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2013 HUAWEI * Author: Cai Zhiyong <caizhiyong@huawei.com> * * Read block device partition table from the command line. * Typically used for fixed block (eMMC) embedded devices. * It has no MBR, so saves storage space. Bootloader can be easily accessed * by absolute address of data on the block device. * Users can easily change the partition. * * The format for the command line is just like mtdparts. * * For further information, see "Documentation/block/cmdline-partition.rst" * */ #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/slab.h> #include "check.h" /* partition flags */ #define PF_RDONLY 0x01 /* Device is read only */ #define PF_POWERUP_LOCK 0x02 /* Always locked after reset */ struct cmdline_subpart { char name[BDEVNAME_SIZE]; /* partition name, such as 'rootfs' */ sector_t from; sector_t size; int flags; struct cmdline_subpart *next_subpart; }; struct cmdline_parts { char name[BDEVNAME_SIZE]; /* block device, such as 'mmcblk0' */ unsigned int nr_subparts; struct cmdline_subpart *subpart; struct cmdline_parts *next_parts; }; static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) { int ret = 0; struct cmdline_subpart *new_subpart; *subpart = NULL; new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); if (!new_subpart) return -ENOMEM; if (*partdef == '-') { new_subpart->size = (sector_t)(~0ULL); partdef++; } else { new_subpart->size = (sector_t)memparse(partdef, &partdef); if (new_subpart->size < (sector_t)PAGE_SIZE) { pr_warn("cmdline partition size is invalid."); ret = -EINVAL; goto fail; } } if (*partdef == '@') { partdef++; new_subpart->from = (sector_t)memparse(partdef, &partdef); } else { new_subpart->from = (sector_t)(~0ULL); } if (*partdef == '(') { partdef++; char *next = strsep(&partdef, ")"); if (!next) { pr_warn("cmdline partition format is invalid."); ret = -EINVAL; goto fail; } strscpy(new_subpart->name, next, sizeof(new_subpart->name)); } else new_subpart->name[0] = '\0'; new_subpart->flags = 0; if (!strncmp(partdef, "ro", 2)) { new_subpart->flags |= PF_RDONLY; partdef += 2; } if (!strncmp(partdef, "lk", 2)) { new_subpart->flags |= PF_POWERUP_LOCK; partdef += 2; } *subpart = new_subpart; return 0; fail: kfree(new_subpart); return ret; } static void free_subpart(struct cmdline_parts *parts) { struct cmdline_subpart *subpart; while (parts->subpart) { subpart = parts->subpart; parts->subpart = subpart->next_subpart; kfree(subpart); } } static int parse_parts(struct cmdline_parts **parts, char *bdevdef) { int ret = -EINVAL; char *next; struct cmdline_subpart **next_subpart; struct cmdline_parts *newparts; *parts = NULL; newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); if (!newparts) return -ENOMEM; next = strsep(&bdevdef, ":"); if (!next) { pr_warn("cmdline partition has no block device."); goto fail; } strscpy(newparts->name, next, sizeof(newparts->name)); newparts->nr_subparts = 0; next_subpart = &newparts->subpart; while ((next = strsep(&bdevdef, ","))) { ret = parse_subpart(next_subpart, next); if (ret) goto fail; newparts->nr_subparts++; next_subpart = &(*next_subpart)->next_subpart; } if (!newparts->subpart) { pr_warn("cmdline partition has no valid partition."); ret = -EINVAL; goto fail; } *parts = newparts; return 0; fail: free_subpart(newparts); kfree(newparts); return ret; } static void cmdline_parts_free(struct cmdline_parts **parts) { struct cmdline_parts *next_parts; while (*parts) { next_parts = (*parts)->next_parts; free_subpart(*parts); kfree(*parts); *parts = next_parts; } } static int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline) { int ret; char *buf; char *pbuf; char *next; struct cmdline_parts **next_parts; *parts = NULL; pbuf = buf = kstrdup(cmdline, GFP_KERNEL); if (!buf) return -ENOMEM; next_parts = parts; while ((next = strsep(&pbuf, ";"))) { ret = parse_parts(next_parts, next); if (ret) goto fail; next_parts = &(*next_parts)->next_parts; } if (!*parts) { pr_warn("cmdline partition has no valid partition."); ret = -EINVAL; goto fail; } ret = 0; done: kfree(buf); return ret; fail: cmdline_parts_free(parts); goto done; } static struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, const char *bdev) { while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) parts = parts->next_parts; return parts; } static char *cmdline; static struct cmdline_parts *bdev_parts; static int add_part(int slot, struct cmdline_subpart *subpart, struct parsed_partitions *state) { struct partition_meta_info *info; char tmp[sizeof(info->volname) + 4]; if (slot >= state->limit) return 1; put_partition(state, slot, subpart->from >> 9, subpart->size >> 9); if (subpart->flags & PF_RDONLY) state->parts[slot].flags |= ADDPART_FLAG_READONLY; info = &state->parts[slot].info; strscpy(info->volname, subpart->name, sizeof(info->volname)); snprintf(tmp, sizeof(tmp), "(%s)", info->volname); strlcat(state->pp_buf, tmp, PAGE_SIZE); state->parts[slot].has_info = true; return 0; } static int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, struct parsed_partitions *state) { sector_t from = 0; struct cmdline_subpart *subpart; int slot = 1; for (subpart = parts->subpart; subpart; subpart = subpart->next_subpart, slot++) { if (subpart->from == (sector_t)(~0ULL)) subpart->from = from; else from = subpart->from; if (from >= disk_size) break; if (subpart->size > (disk_size - from)) subpart->size = disk_size - from; from += subpart->size; if (add_part(slot, subpart, state)) break; } return slot; } static int __init cmdline_parts_setup(char *s) { cmdline = s; return 1; } __setup("blkdevparts=", cmdline_parts_setup); static bool has_overlaps(sector_t from, sector_t size, sector_t from2, sector_t size2) { sector_t end = from + size; sector_t end2 = from2 + size2; if (from >= from2 && from < end2) return true; if (end > from2 && end <= end2) return true; if (from2 >= from && from2 < end) return true; if (end2 > from && end2 <= end) return true; return false; } static inline void overlaps_warns_header(void) { pr_warn("Overlapping partitions are used in command line partitions."); pr_warn("Don't use filesystems on overlapping partitions:"); } static void cmdline_parts_verifier(int slot, struct parsed_partitions *state) { int i; bool header = true; for (; slot < state->limit && state->parts[slot].has_info; slot++) { for (i = slot+1; i < state->limit && state->parts[i].has_info; i++) { if (has_overlaps(state->parts[slot].from, state->parts[slot].size, state->parts[i].from, state->parts[i].size)) { if (header) { header = false; overlaps_warns_header(); } pr_warn("%s[%llu,%llu] overlaps with " "%s[%llu,%llu].", state->parts[slot].info.volname, (u64)state->parts[slot].from << 9, (u64)state->parts[slot].size << 9, state->parts[i].info.volname, (u64)state->parts[i].from << 9, (u64)state->parts[i].size << 9); } } } } /* * Purpose: allocate cmdline partitions. * Returns: * -1 if unable to read the partition table * 0 if this isn't our partition table * 1 if successful */ int cmdline_partition(struct parsed_partitions *state) { sector_t disk_size; struct cmdline_parts *parts; if (cmdline) { if (bdev_parts) cmdline_parts_free(&bdev_parts); if (cmdline_parts_parse(&bdev_parts, cmdline)) { cmdline = NULL; return -1; } cmdline = NULL; } if (!bdev_parts) return 0; parts = cmdline_parts_find(bdev_parts, state->disk->disk_name); if (!parts) return 0; disk_size = get_capacity(state->disk) << 9; cmdline_parts_set(parts, disk_size, state); cmdline_parts_verifier(1, state); strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; } |
2 2 2 2 2 2 2 2 158 1 27 42 13 4 152 145 14 105 191 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 | // SPDX-License-Identifier: GPL-2.0-only /* * misc.c * * PURPOSE * Miscellaneous routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT * (C) 1998 Dave Boynton * (C) 1998-2004 Ben Fennema * (C) 1999-2000 Stelias Computing Inc * * HISTORY * * 04/19/99 blf partial support for reading/writing specific EA's */ #include "udfdecl.h" #include <linux/fs.h> #include <linux/string.h> #include <linux/crc-itu-t.h> #include "udf_i.h" #include "udf_sb.h" struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, uint32_t type, uint8_t loc) { uint8_t *ea = NULL, *ad = NULL; int offset; uint16_t crclen; struct udf_inode_info *iinfo = UDF_I(inode); ea = iinfo->i_data; if (iinfo->i_lenEAttr) { ad = iinfo->i_data + iinfo->i_lenEAttr; } else { ad = ea; size += sizeof(struct extendedAttrHeaderDesc); } offset = inode->i_sb->s_blocksize - udf_file_entry_alloc_offset(inode) - iinfo->i_lenAlloc; /* TODO - Check for FreeEASpace */ if (loc & 0x01 && offset >= size) { struct extendedAttrHeaderDesc *eahd; eahd = (struct extendedAttrHeaderDesc *)ea; if (iinfo->i_lenAlloc) memmove(&ad[size], ad, iinfo->i_lenAlloc); if (iinfo->i_lenEAttr) { /* check checksum/crc */ if (eahd->descTag.tagIdent != cpu_to_le16(TAG_IDENT_EAHD) || le32_to_cpu(eahd->descTag.tagLocation) != iinfo->i_location.logicalBlockNum) return NULL; } else { struct udf_sb_info *sbi = UDF_SB(inode->i_sb); size -= sizeof(struct extendedAttrHeaderDesc); iinfo->i_lenEAttr += sizeof(struct extendedAttrHeaderDesc); eahd->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EAHD); if (sbi->s_udfrev >= 0x0200) eahd->descTag.descVersion = cpu_to_le16(3); else eahd->descTag.descVersion = cpu_to_le16(2); eahd->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number); eahd->descTag.tagLocation = cpu_to_le32( iinfo->i_location.logicalBlockNum); eahd->impAttrLocation = cpu_to_le32(0xFFFFFFFF); eahd->appAttrLocation = cpu_to_le32(0xFFFFFFFF); } offset = iinfo->i_lenEAttr; if (type < 2048) { if (le32_to_cpu(eahd->appAttrLocation) < iinfo->i_lenEAttr) { uint32_t aal = le32_to_cpu(eahd->appAttrLocation); memmove(&ea[offset - aal + size], &ea[aal], offset - aal); offset -= aal; eahd->appAttrLocation = cpu_to_le32(aal + size); } if (le32_to_cpu(eahd->impAttrLocation) < iinfo->i_lenEAttr) { uint32_t ial = le32_to_cpu(eahd->impAttrLocation); memmove(&ea[offset - ial + size], &ea[ial], offset - ial); offset -= ial; eahd->impAttrLocation = cpu_to_le32(ial + size); } } else if (type < 65536) { if (le32_to_cpu(eahd->appAttrLocation) < iinfo->i_lenEAttr) { uint32_t aal = le32_to_cpu(eahd->appAttrLocation); memmove(&ea[offset - aal + size], &ea[aal], offset - aal); offset -= aal; eahd->appAttrLocation = cpu_to_le32(aal + size); } } /* rewrite CRC + checksum of eahd */ crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag); eahd->descTag.descCRCLength = cpu_to_le16(crclen); eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd + sizeof(struct tag), crclen)); eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag); iinfo->i_lenEAttr += size; return (struct genericFormat *)&ea[offset]; } return NULL; } struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type, uint8_t subtype) { struct genericFormat *gaf; uint8_t *ea = NULL; uint32_t offset; struct udf_inode_info *iinfo = UDF_I(inode); ea = iinfo->i_data; if (iinfo->i_lenEAttr) { struct extendedAttrHeaderDesc *eahd; eahd = (struct extendedAttrHeaderDesc *)ea; /* check checksum/crc */ if (eahd->descTag.tagIdent != cpu_to_le16(TAG_IDENT_EAHD) || le32_to_cpu(eahd->descTag.tagLocation) != iinfo->i_location.logicalBlockNum) return NULL; if (type < 2048) offset = sizeof(struct extendedAttrHeaderDesc); else if (type < 65536) offset = le32_to_cpu(eahd->impAttrLocation); else offset = le32_to_cpu(eahd->appAttrLocation); while (offset + sizeof(*gaf) < iinfo->i_lenEAttr) { uint32_t attrLength; gaf = (struct genericFormat *)&ea[offset]; attrLength = le32_to_cpu(gaf->attrLength); /* Detect undersized elements and buffer overflows */ if ((attrLength < sizeof(*gaf)) || (attrLength > (iinfo->i_lenEAttr - offset))) break; if (le32_to_cpu(gaf->attrType) == type && gaf->attrSubtype == subtype) return gaf; else offset += attrLength; } } return NULL; } /* * udf_read_tagged * * PURPOSE * Read the first block of a tagged descriptor. * * HISTORY * July 1, 1997 - Andrew E. Mileski * Written, tested, and released. */ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, uint32_t location, uint16_t *ident) { struct tag *tag_p; struct buffer_head *bh = NULL; u8 checksum; /* Read the block */ if (block == 0xFFFFFFFF) return NULL; bh = sb_bread(sb, block); if (!bh) { udf_err(sb, "read failed, block=%u, location=%u\n", block, location); return NULL; } tag_p = (struct tag *)(bh->b_data); *ident = le16_to_cpu(tag_p->tagIdent); if (location != le32_to_cpu(tag_p->tagLocation)) { udf_debug("location mismatch block %u, tag %u != %u\n", block, le32_to_cpu(tag_p->tagLocation), location); goto error_out; } /* Verify the tag checksum */ checksum = udf_tag_checksum(tag_p); if (checksum != tag_p->tagChecksum) { udf_err(sb, "tag checksum failed, block %u: 0x%02x != 0x%02x\n", block, checksum, tag_p->tagChecksum); goto error_out; } /* Verify the tag version */ if (tag_p->descVersion != cpu_to_le16(0x0002U) && tag_p->descVersion != cpu_to_le16(0x0003U)) { udf_err(sb, "tag version 0x%04x != 0x0002 || 0x0003, block %u\n", le16_to_cpu(tag_p->descVersion), block); goto error_out; } /* Verify the descriptor CRC */ if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize || le16_to_cpu(tag_p->descCRC) == crc_itu_t(0, bh->b_data + sizeof(struct tag), le16_to_cpu(tag_p->descCRCLength))) return bh; udf_debug("Crc failure block %u: crc = %u, crclen = %u\n", block, le16_to_cpu(tag_p->descCRC), le16_to_cpu(tag_p->descCRCLength)); error_out: brelse(bh); return NULL; } struct buffer_head *udf_read_ptagged(struct super_block *sb, struct kernel_lb_addr *loc, uint32_t offset, uint16_t *ident) { return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset), loc->logicalBlockNum + offset, ident); } void udf_update_tag(char *data, int length) { struct tag *tptr = (struct tag *)data; length -= sizeof(struct tag); tptr->descCRCLength = cpu_to_le16(length); tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(struct tag), length)); tptr->tagChecksum = udf_tag_checksum(tptr); } void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum, uint32_t loc, int length) { struct tag *tptr = (struct tag *)data; tptr->tagIdent = cpu_to_le16(ident); tptr->descVersion = cpu_to_le16(version); tptr->tagSerialNum = cpu_to_le16(snum); tptr->tagLocation = cpu_to_le32(loc); udf_update_tag(data, length); } u8 udf_tag_checksum(const struct tag *t) { u8 *data = (u8 *)t; u8 checksum = 0; int i; for (i = 0; i < sizeof(struct tag); ++i) if (i != 4) /* position of checksum */ checksum += data[i]; return checksum; } |
18 1 17 18 13 3 7 3 9 1 7 3 10 1 1 8 3 5 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 | // SPDX-License-Identifier: GPL-2.0-only /* * This file contians vfs file ops for 9P2000. * * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> */ #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/sched.h> #include <linux/file.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/list.h> #include <linux/pagemap.h> #include <linux/utsname.h> #include <linux/uaccess.h> #include <linux/uio.h> #include <linux/slab.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include "v9fs.h" #include "v9fs_vfs.h" #include "fid.h" #include "cache.h" static const struct vm_operations_struct v9fs_mmap_file_vm_ops; /** * v9fs_file_open - open a file (or directory) * @inode: inode to be opened * @file: file being opened * */ int v9fs_file_open(struct inode *inode, struct file *file) { int err; struct v9fs_session_info *v9ses; struct p9_fid *fid; int omode; p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); v9ses = v9fs_inode2v9ses(inode); if (v9fs_proto_dotl(v9ses)) omode = v9fs_open_to_dotl_flags(file->f_flags); else omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses)); fid = file->private_data; if (!fid) { fid = v9fs_fid_clone(file_dentry(file)); if (IS_ERR(fid)) return PTR_ERR(fid); if ((v9ses->cache & CACHE_WRITEBACK) && (omode & P9_OWRITE)) { int writeback_omode = (omode & ~P9_OWRITE) | P9_ORDWR; p9_debug(P9_DEBUG_CACHE, "write-only file with writeback enabled, try opening O_RDWR\n"); err = p9_client_open(fid, writeback_omode); if (err < 0) { p9_debug(P9_DEBUG_CACHE, "could not open O_RDWR, disabling caches\n"); err = p9_client_open(fid, omode); fid->mode |= P9L_DIRECT; } } else { err = p9_client_open(fid, omode); } if (err < 0) { p9_fid_put(fid); return err; } if ((file->f_flags & O_APPEND) && (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses))) generic_file_llseek(file, 0, SEEK_END); file->private_data = fid; } #ifdef CONFIG_9P_FSCACHE if (v9ses->cache & CACHE_FSCACHE) fscache_use_cookie(v9fs_inode_cookie(V9FS_I(inode)), file->f_mode & FMODE_WRITE); #endif v9fs_fid_add_modes(fid, v9ses->flags, v9ses->cache, file->f_flags); v9fs_open_fid_add(inode, &fid); return 0; } /** * v9fs_file_lock - lock a file (or directory) * @filp: file to be locked * @cmd: lock command * @fl: file lock structure * * Bugs: this looks like a local only lock, we should extend into 9P * by using open exclusive */ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(filp); p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->c.flc_type != F_UNLCK) { filemap_write_and_wait(inode->i_mapping); invalidate_mapping_pages(&inode->i_data, 0, -1); } return 0; } static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) { struct p9_flock flock; struct p9_fid *fid; uint8_t status = P9_LOCK_ERROR; int res = 0; struct v9fs_session_info *v9ses; fid = filp->private_data; BUG_ON(fid == NULL); BUG_ON((fl->c.flc_flags & FL_POSIX) != FL_POSIX); res = locks_lock_file_wait(filp, fl); if (res < 0) goto out; /* convert posix lock to p9 tlock args */ memset(&flock, 0, sizeof(flock)); /* map the lock type */ switch (fl->c.flc_type) { case F_RDLCK: flock.type = P9_LOCK_TYPE_RDLCK; break; case F_WRLCK: flock.type = P9_LOCK_TYPE_WRLCK; break; case F_UNLCK: flock.type = P9_LOCK_TYPE_UNLCK; break; } flock.start = fl->fl_start; if (fl->fl_end == OFFSET_MAX) flock.length = 0; else flock.length = fl->fl_end - fl->fl_start + 1; flock.proc_id = fl->c.flc_pid; flock.client_id = fid->clnt->name; if (IS_SETLKW(cmd)) flock.flags = P9_LOCK_FLAGS_BLOCK; v9ses = v9fs_inode2v9ses(file_inode(filp)); /* * if its a blocked request and we get P9_LOCK_BLOCKED as the status * for lock request, keep on trying */ for (;;) { res = p9_client_lock_dotl(fid, &flock, &status); if (res < 0) goto out_unlock; if (status != P9_LOCK_BLOCKED) break; if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd)) break; if (schedule_timeout_interruptible(v9ses->session_lock_timeout) != 0) break; /* * p9_client_lock_dotl overwrites flock.client_id with the * server message, free and reuse the client name */ if (flock.client_id != fid->clnt->name) { kfree(flock.client_id); flock.client_id = fid->clnt->name; } } /* map 9p status to VFS status */ switch (status) { case P9_LOCK_SUCCESS: res = 0; break; case P9_LOCK_BLOCKED: res = -EAGAIN; break; default: WARN_ONCE(1, "unknown lock status code: %d\n", status); fallthrough; case P9_LOCK_ERROR: case P9_LOCK_GRACE: res = -ENOLCK; break; } out_unlock: /* * incase server returned error for lock request, revert * it locally */ if (res < 0 && fl->c.flc_type != F_UNLCK) { unsigned char type = fl->c.flc_type; fl->c.flc_type = F_UNLCK; /* Even if this fails we want to return the remote error */ locks_lock_file_wait(filp, fl); fl->c.flc_type = type; } if (flock.client_id != fid->clnt->name) kfree(flock.client_id); out: return res; } static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) { struct p9_getlock glock; struct p9_fid *fid; int res = 0; fid = filp->private_data; BUG_ON(fid == NULL); posix_test_lock(filp, fl); /* * if we have a conflicting lock locally, no need to validate * with server */ if (fl->c.flc_type != F_UNLCK) return res; /* convert posix lock to p9 tgetlock args */ memset(&glock, 0, sizeof(glock)); glock.type = P9_LOCK_TYPE_UNLCK; glock.start = fl->fl_start; if (fl->fl_end == OFFSET_MAX) glock.length = 0; else glock.length = fl->fl_end - fl->fl_start + 1; glock.proc_id = fl->c.flc_pid; glock.client_id = fid->clnt->name; res = p9_client_getlock_dotl(fid, &glock); if (res < 0) goto out; /* map 9p lock type to os lock type */ switch (glock.type) { case P9_LOCK_TYPE_RDLCK: fl->c.flc_type = F_RDLCK; break; case P9_LOCK_TYPE_WRLCK: fl->c.flc_type = F_WRLCK; break; case P9_LOCK_TYPE_UNLCK: fl->c.flc_type = F_UNLCK; break; } if (glock.type != P9_LOCK_TYPE_UNLCK) { fl->fl_start = glock.start; if (glock.length == 0) fl->fl_end = OFFSET_MAX; else fl->fl_end = glock.start + glock.length - 1; fl->c.flc_pid = -glock.proc_id; } out: if (glock.client_id != fid->clnt->name) kfree(glock.client_id); return res; } /** * v9fs_file_lock_dotl - lock a file (or directory) * @filp: file to be locked * @cmd: lock command * @fl: file lock structure * */ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(filp); int ret = -ENOLCK; p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n", filp, cmd, fl, filp); if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->c.flc_type != F_UNLCK) { filemap_write_and_wait(inode->i_mapping); invalidate_mapping_pages(&inode->i_data, 0, -1); } if (IS_SETLK(cmd) || IS_SETLKW(cmd)) ret = v9fs_file_do_lock(filp, cmd, fl); else if (IS_GETLK(cmd)) ret = v9fs_file_getlock(filp, fl); else ret = -EINVAL; return ret; } /** * v9fs_file_flock_dotl - lock a file * @filp: file to be locked * @cmd: lock command * @fl: file lock structure * */ static int v9fs_file_flock_dotl(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(filp); int ret = -ENOLCK; p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n", filp, cmd, fl, filp); if (!(fl->c.flc_flags & FL_FLOCK)) goto out_err; if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->c.flc_type != F_UNLCK) { filemap_write_and_wait(inode->i_mapping); invalidate_mapping_pages(&inode->i_data, 0, -1); } /* Convert flock to posix lock */ fl->c.flc_flags |= FL_POSIX; fl->c.flc_flags ^= FL_FLOCK; if (IS_SETLK(cmd) | IS_SETLKW(cmd)) ret = v9fs_file_do_lock(filp, cmd, fl); else ret = -EINVAL; out_err: return ret; } /** * v9fs_file_read_iter - read from a file * @iocb: The operation parameters * @to: The buffer to read into * */ static ssize_t v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct p9_fid *fid = iocb->ki_filp->private_data; p9_debug(P9_DEBUG_VFS, "fid %d count %zu offset %lld\n", fid->fid, iov_iter_count(to), iocb->ki_pos); if (fid->mode & P9L_DIRECT) return netfs_unbuffered_read_iter(iocb, to); p9_debug(P9_DEBUG_VFS, "(cached)\n"); return netfs_file_read_iter(iocb, to); } /* * v9fs_file_splice_read - splice-read from a file * @in: The 9p file to read from * @ppos: Where to find/update the file position * @pipe: The pipe to splice into * @len: The maximum amount of data to splice * @flags: SPLICE_F_* flags */ static ssize_t v9fs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct p9_fid *fid = in->private_data; p9_debug(P9_DEBUG_VFS, "fid %d count %zu offset %lld\n", fid->fid, len, *ppos); if (fid->mode & P9L_DIRECT) return copy_splice_read(in, ppos, pipe, len, flags); return filemap_splice_read(in, ppos, pipe, len, flags); } /** * v9fs_file_write_iter - write to a file * @iocb: The operation parameters * @from: The data to write * */ static ssize_t v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct p9_fid *fid = file->private_data; p9_debug(P9_DEBUG_VFS, "fid %d\n", fid->fid); if (fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE)) return netfs_unbuffered_write_iter(iocb, from); p9_debug(P9_DEBUG_CACHE, "(cached)\n"); return netfs_file_write_iter(iocb, from); } static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct p9_fid *fid; struct inode *inode = filp->f_mapping->host; struct p9_wstat wstat; int retval; retval = file_write_and_wait_range(filp, start, end); if (retval) return retval; inode_lock(inode); p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; v9fs_blank_wstat(&wstat); retval = p9_client_wstat(fid, &wstat); inode_unlock(inode); return retval; } int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, int datasync) { struct p9_fid *fid; struct inode *inode = filp->f_mapping->host; int retval; retval = file_write_and_wait_range(filp, start, end); if (retval) return retval; inode_lock(inode); p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; retval = p9_client_fsync(fid, datasync); inode_unlock(inode); return retval; } static int v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma) { int retval; struct inode *inode = file_inode(filp); struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); p9_debug(P9_DEBUG_MMAP, "filp :%p\n", filp); if (!(v9ses->cache & CACHE_WRITEBACK)) { p9_debug(P9_DEBUG_CACHE, "(read-only mmap mode)"); return generic_file_readonly_mmap(filp, vma); } retval = generic_file_mmap(filp, vma); if (!retval) vma->vm_ops = &v9fs_mmap_file_vm_ops; return retval; } static vm_fault_t v9fs_vm_page_mkwrite(struct vm_fault *vmf) { return netfs_page_mkwrite(vmf, NULL); } static void v9fs_mmap_vm_close(struct vm_area_struct *vma) { struct inode *inode; struct writeback_control wbc = { .nr_to_write = LONG_MAX, .sync_mode = WB_SYNC_ALL, .range_start = (loff_t)vma->vm_pgoff * PAGE_SIZE, /* absolute end, byte at end included */ .range_end = (loff_t)vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start - 1), }; if (!(vma->vm_flags & VM_SHARED)) return; p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma); inode = file_inode(vma->vm_file); filemap_fdatawrite_wbc(inode->i_mapping, &wbc); } static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { .close = v9fs_mmap_vm_close, .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, }; const struct file_operations v9fs_file_operations = { .llseek = generic_file_llseek, .read_iter = v9fs_file_read_iter, .write_iter = v9fs_file_write_iter, .open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock, .mmap = generic_file_readonly_mmap, .splice_read = v9fs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync, .setlease = simple_nosetlease, }; const struct file_operations v9fs_file_operations_dotl = { .llseek = generic_file_llseek, .read_iter = v9fs_file_read_iter, .write_iter = v9fs_file_write_iter, .open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock_dotl, .flock = v9fs_file_flock_dotl, .mmap = v9fs_file_mmap, .splice_read = v9fs_file_splice_read, .splice_write = iter_file_splice_write, .fsync = v9fs_file_fsync_dotl, .setlease = simple_nosetlease, }; |
15 32 16 83 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM mptcp #if !defined(_TRACE_MPTCP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MPTCP_H #include <linux/tracepoint.h> #define show_mapping_status(status) \ __print_symbolic(status, \ { 0, "MAPPING_OK" }, \ { 1, "MAPPING_INVALID" }, \ { 2, "MAPPING_EMPTY" }, \ { 3, "MAPPING_DATA_FIN" }, \ { 4, "MAPPING_DUMMY" }) TRACE_EVENT(mptcp_subflow_get_send, TP_PROTO(struct mptcp_subflow_context *subflow), TP_ARGS(subflow), TP_STRUCT__entry( __field(bool, active) __field(bool, free) __field(u32, snd_wnd) __field(u32, pace) __field(u8, backup) __field(u64, ratio) ), TP_fast_assign( struct sock *ssk; __entry->active = mptcp_subflow_active(subflow); __entry->backup = subflow->backup || subflow->request_bkup; if (subflow->tcp_sock && sk_fullsock(subflow->tcp_sock)) __entry->free = sk_stream_memory_free(subflow->tcp_sock); else __entry->free = 0; ssk = mptcp_subflow_tcp_sock(subflow); if (ssk && sk_fullsock(ssk)) { __entry->snd_wnd = tcp_sk(ssk)->snd_wnd; __entry->pace = READ_ONCE(ssk->sk_pacing_rate); } else { __entry->snd_wnd = 0; __entry->pace = 0; } if (ssk && sk_fullsock(ssk) && __entry->pace) __entry->ratio = div_u64((u64)ssk->sk_wmem_queued << 32, __entry->pace); else __entry->ratio = 0; ), TP_printk("active=%d free=%d snd_wnd=%u pace=%u backup=%u ratio=%llu", __entry->active, __entry->free, __entry->snd_wnd, __entry->pace, __entry->backup, __entry->ratio) ); DECLARE_EVENT_CLASS(mptcp_dump_mpext, TP_PROTO(struct mptcp_ext *mpext), TP_ARGS(mpext), TP_STRUCT__entry( __field(u64, data_ack) __field(u64, data_seq) __field(u32, subflow_seq) __field(u16, data_len) __field(u16, csum) __field(u8, use_map) __field(u8, dsn64) __field(u8, data_fin) __field(u8, use_ack) __field(u8, ack64) __field(u8, mpc_map) __field(u8, frozen) __field(u8, reset_transient) __field(u8, reset_reason) __field(u8, csum_reqd) __field(u8, infinite_map) ), TP_fast_assign( __entry->data_ack = mpext->ack64 ? mpext->data_ack : mpext->data_ack32; __entry->data_seq = mpext->data_seq; __entry->subflow_seq = mpext->subflow_seq; __entry->data_len = mpext->data_len; __entry->csum = (__force u16)mpext->csum; __entry->use_map = mpext->use_map; __entry->dsn64 = mpext->dsn64; __entry->data_fin = mpext->data_fin; __entry->use_ack = mpext->use_ack; __entry->ack64 = mpext->ack64; __entry->mpc_map = mpext->mpc_map; __entry->frozen = mpext->frozen; __entry->reset_transient = mpext->reset_transient; __entry->reset_reason = mpext->reset_reason; __entry->csum_reqd = mpext->csum_reqd; __entry->infinite_map = mpext->infinite_map; ), TP_printk("data_ack=%llu data_seq=%llu subflow_seq=%u data_len=%u csum=%x use_map=%u dsn64=%u data_fin=%u use_ack=%u ack64=%u mpc_map=%u frozen=%u reset_transient=%u reset_reason=%u csum_reqd=%u infinite_map=%u", __entry->data_ack, __entry->data_seq, __entry->subflow_seq, __entry->data_len, __entry->csum, __entry->use_map, __entry->dsn64, __entry->data_fin, __entry->use_ack, __entry->ack64, __entry->mpc_map, __entry->frozen, __entry->reset_transient, __entry->reset_reason, __entry->csum_reqd, __entry->infinite_map) ); DEFINE_EVENT(mptcp_dump_mpext, mptcp_sendmsg_frag, TP_PROTO(struct mptcp_ext *mpext), TP_ARGS(mpext)); DEFINE_EVENT(mptcp_dump_mpext, get_mapping_status, TP_PROTO(struct mptcp_ext *mpext), TP_ARGS(mpext)); TRACE_EVENT(ack_update_msk, TP_PROTO(u64 data_ack, u64 old_snd_una, u64 new_snd_una, u64 new_wnd_end, u64 msk_wnd_end), TP_ARGS(data_ack, old_snd_una, new_snd_una, new_wnd_end, msk_wnd_end), TP_STRUCT__entry( __field(u64, data_ack) __field(u64, old_snd_una) __field(u64, new_snd_una) __field(u64, new_wnd_end) __field(u64, msk_wnd_end) ), TP_fast_assign( __entry->data_ack = data_ack; __entry->old_snd_una = old_snd_una; __entry->new_snd_una = new_snd_una; __entry->new_wnd_end = new_wnd_end; __entry->msk_wnd_end = msk_wnd_end; ), TP_printk("data_ack=%llu old_snd_una=%llu new_snd_una=%llu new_wnd_end=%llu msk_wnd_end=%llu", __entry->data_ack, __entry->old_snd_una, __entry->new_snd_una, __entry->new_wnd_end, __entry->msk_wnd_end) ); TRACE_EVENT(subflow_check_data_avail, TP_PROTO(__u8 status, struct sk_buff *skb), TP_ARGS(status, skb), TP_STRUCT__entry( __field(u8, status) __field(const void *, skb) ), TP_fast_assign( __entry->status = status; __entry->skb = skb; ), TP_printk("mapping_status=%s, skb=%p", show_mapping_status(__entry->status), __entry->skb) ); #endif /* _TRACE_MPTCP_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
12 5 12 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 | /* * linux/fs/nls/nls_iso8859-4.c * * Charset iso8859-4 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* 0x90*/ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* 0xa0*/ 0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7, 0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af, /* 0xb0*/ 0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7, 0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b, /* 0xc0*/ 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a, /* 0xd0*/ 0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df, /* 0xe0*/ 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b, /* 0xf0*/ 0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0x00, 0x00, 0x00, 0x00, 0xad, 0x00, 0xaf, /* 0xa8-0xaf */ 0xb0, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0x00, /* 0xc0-0xc7 */ 0x00, 0xc9, 0x00, 0xcb, 0x00, 0xcd, 0xce, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0x00, 0xda, 0xdb, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ 0x00, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0x00, /* 0xe0-0xe7 */ 0x00, 0xe9, 0x00, 0xeb, 0x00, 0xed, 0xee, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0x00, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0x00, 0xfa, 0xfb, 0xfc, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0xc0, 0xe0, 0x00, 0x00, 0xa1, 0xb1, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xc8, 0xe8, 0x00, 0x00, /* 0x08-0x0f */ 0xd0, 0xf0, 0xaa, 0xba, 0x00, 0x00, 0xcc, 0xec, /* 0x10-0x17 */ 0xca, 0xea, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0xab, 0xbb, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0xa5, 0xb5, 0xcf, 0xef, 0x00, 0x00, 0xc7, 0xe7, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd3, 0xf3, /* 0x30-0x37 */ 0xa2, 0x00, 0x00, 0xa6, 0xb6, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0xd1, 0xf1, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0xbd, 0xbf, 0xd2, 0xf2, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa3, 0xb3, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0xa9, 0xb9, 0x00, 0x00, 0x00, 0x00, 0xac, 0xbc, /* 0x60-0x67 */ 0xdd, 0xfd, 0xde, 0xfe, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0xd9, 0xf9, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0xae, 0xbe, 0x00, /* 0x78-0x7f */ }; static const unsigned char page02[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb7, /* 0xc0-0xc7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */ 0x00, 0xff, 0x00, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, page02, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xb1, 0xa2, 0xb3, 0xa4, 0xb5, 0xb6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xb9, 0xba, 0xbb, 0xbc, 0xad, 0xbe, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbf, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xa1, 0xb2, 0xa3, 0xb4, 0xa5, 0xa6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xa9, 0xaa, 0xab, 0xac, 0xbd, 0xae, 0xbd, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "iso8859-4", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_iso8859_4(void) { return register_nls(&table); } static void __exit exit_nls_iso8859_4(void) { unregister_nls(&table); } module_init(init_nls_iso8859_4) module_exit(exit_nls_iso8859_4) MODULE_DESCRIPTION("NLS ISO 8859-4 (Latin 4; old Baltic charset)"); MODULE_LICENSE("Dual BSD/GPL"); |
4 1 3 2 3 3 12 14 2 1 31 2 2 2 5 15 3 16 2 15 3 16 2 14 4 13 1 13 1 1 5 8 9 2 14 1 2 2 12 4 4 11 4 4 6 6 5 1 8 2 5 6 1 5 1 5 1 6 6 8 8 8 8 12 8 4 8 4 52 52 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016, Amir Vadai <amir@vadai.me> * Copyright (c) 2016, Mellanox Technologies. All rights reserved. */ #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <net/geneve.h> #include <net/vxlan.h> #include <net/erspan.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/dst.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> #include <linux/tc_act/tc_tunnel_key.h> #include <net/tc_act/tc_tunnel_key.h> static struct tc_action_ops act_tunnel_key_ops; TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; int action; params = rcu_dereference_bh(t->params); tcf_lastuse_update(&t->tcf_tm); tcf_action_update_bstats(&t->common, skb); action = READ_ONCE(t->tcf_action); switch (params->tcft_action) { case TCA_TUNNEL_KEY_ACT_RELEASE: skb_dst_drop(skb); break; case TCA_TUNNEL_KEY_ACT_SET: skb_dst_drop(skb); skb_dst_set(skb, dst_clone(¶ms->tcft_enc_metadata->dst)); break; default: WARN_ONCE(1, "Bad tunnel_key action %d.\n", params->tcft_action); break; } return action; } static const struct nla_policy enc_opts_policy[TCA_TUNNEL_KEY_ENC_OPTS_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_OPTS_UNSPEC] = { .strict_start_type = TCA_TUNNEL_KEY_ENC_OPTS_VXLAN }, [TCA_TUNNEL_KEY_ENC_OPTS_GENEVE] = { .type = NLA_NESTED }, [TCA_TUNNEL_KEY_ENC_OPTS_VXLAN] = { .type = NLA_NESTED }, [TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN] = { .type = NLA_NESTED }, }; static const struct nla_policy geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, }; static const struct nla_policy vxlan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP] = { .type = NLA_U32 }, }; static const struct nla_policy erspan_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER] = { .type = NLA_U8 }, [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX] = { .type = NLA_U32 }, [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] = { .type = NLA_U8 }, [TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, }; static int tunnel_key_copy_geneve_opt(const struct nlattr *nla, void *dst, int dst_len, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1]; int err, data_len, opt_len; u8 *data; err = nla_parse_nested_deprecated(tb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX, nla, geneve_opt_policy, extack); if (err < 0) return err; if (!tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] || !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] || !tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]) { NL_SET_ERR_MSG(extack, "Missing tunnel key geneve option class, type or data"); return -EINVAL; } data = nla_data(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]); data_len = nla_len(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA]); if (data_len < 4) { NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is less than 4 bytes long"); return -ERANGE; } if (data_len % 4) { NL_SET_ERR_MSG(extack, "Tunnel key geneve option data is not a multiple of 4 bytes long"); return -ERANGE; } opt_len = sizeof(struct geneve_opt) + data_len; if (dst) { struct geneve_opt *opt = dst; WARN_ON(dst_len < opt_len); opt->opt_class = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS]); opt->type = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE]); opt->length = data_len / 4; /* length is in units of 4 bytes */ opt->r1 = 0; opt->r2 = 0; opt->r3 = 0; memcpy(opt + 1, data, data_len); } return opt_len; } static int tunnel_key_copy_vxlan_opt(const struct nlattr *nla, void *dst, int dst_len, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX + 1]; int err; err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX, nla, vxlan_opt_policy, extack); if (err < 0) return err; if (!tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]) { NL_SET_ERR_MSG(extack, "Missing tunnel key vxlan option gbp"); return -EINVAL; } if (dst) { struct vxlan_metadata *md = dst; md->gbp = nla_get_u32(tb[TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP]); md->gbp &= VXLAN_GBP_MASK; } return sizeof(struct vxlan_metadata); } static int tunnel_key_copy_erspan_opt(const struct nlattr *nla, void *dst, int dst_len, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX + 1]; int err; u8 ver; err = nla_parse_nested(tb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX, nla, erspan_opt_policy, extack); if (err < 0) return err; if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]) { NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option ver"); return -EINVAL; } ver = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER]); if (ver == 1) { if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]) { NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option index"); return -EINVAL; } } else if (ver == 2) { if (!tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR] || !tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]) { NL_SET_ERR_MSG(extack, "Missing tunnel key erspan option dir or hwid"); return -EINVAL; } } else { NL_SET_ERR_MSG(extack, "Tunnel key erspan option ver is incorrect"); return -EINVAL; } if (dst) { struct erspan_metadata *md = dst; md->version = ver; if (ver == 1) { nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX]; md->u.index = nla_get_be32(nla); } else { nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR]; md->u.md2.dir = nla_get_u8(nla); nla = tb[TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID]; set_hwid(&md->u.md2, nla_get_u8(nla)); } } return sizeof(struct erspan_metadata); } static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst, int dst_len, struct netlink_ext_ack *extack) { int err, rem, opt_len, len = nla_len(nla), opts_len = 0, type = 0; const struct nlattr *attr, *head = nla_data(nla); err = nla_validate_deprecated(head, len, TCA_TUNNEL_KEY_ENC_OPTS_MAX, enc_opts_policy, extack); if (err) return err; nla_for_each_attr(attr, head, len, rem) { switch (nla_type(attr)) { case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: if (type && type != IP_TUNNEL_GENEVE_OPT_BIT) { NL_SET_ERR_MSG(extack, "Duplicate type for geneve options"); return -EINVAL; } opt_len = tunnel_key_copy_geneve_opt(attr, dst, dst_len, extack); if (opt_len < 0) return opt_len; opts_len += opt_len; if (opts_len > IP_TUNNEL_OPTS_MAX) { NL_SET_ERR_MSG(extack, "Tunnel options exceeds max size"); return -EINVAL; } if (dst) { dst_len -= opt_len; dst += opt_len; } type = IP_TUNNEL_GENEVE_OPT_BIT; break; case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN: if (type) { NL_SET_ERR_MSG(extack, "Duplicate type for vxlan options"); return -EINVAL; } opt_len = tunnel_key_copy_vxlan_opt(attr, dst, dst_len, extack); if (opt_len < 0) return opt_len; opts_len += opt_len; type = IP_TUNNEL_VXLAN_OPT_BIT; break; case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN: if (type) { NL_SET_ERR_MSG(extack, "Duplicate type for erspan options"); return -EINVAL; } opt_len = tunnel_key_copy_erspan_opt(attr, dst, dst_len, extack); if (opt_len < 0) return opt_len; opts_len += opt_len; type = IP_TUNNEL_ERSPAN_OPT_BIT; break; } } if (!opts_len) { NL_SET_ERR_MSG(extack, "Empty list of tunnel options"); return -EINVAL; } if (rem > 0) { NL_SET_ERR_MSG(extack, "Trailing data after parsing tunnel key options attributes"); return -EINVAL; } return opts_len; } static int tunnel_key_get_opts_len(struct nlattr *nla, struct netlink_ext_ack *extack) { return tunnel_key_copy_opts(nla, NULL, 0, extack); } static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info, int opts_len, struct netlink_ext_ack *extack) { info->options_len = opts_len; switch (nla_type(nla_data(nla))) { case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE: #if IS_ENABLED(CONFIG_INET) __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags); return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), opts_len, extack); #else return -EAFNOSUPPORT; #endif case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN: #if IS_ENABLED(CONFIG_INET) __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags); return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), opts_len, extack); #else return -EAFNOSUPPORT; #endif case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN: #if IS_ENABLED(CONFIG_INET) __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags); return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info), opts_len, extack); #else return -EAFNOSUPPORT; #endif default: NL_SET_ERR_MSG(extack, "Cannot set tunnel options for unknown tunnel type"); return -EINVAL; } } static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = { [TCA_TUNNEL_KEY_PARMS] = { .len = sizeof(struct tc_tunnel_key) }, [TCA_TUNNEL_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 }, [TCA_TUNNEL_KEY_ENC_IPV4_DST] = { .type = NLA_U32 }, [TCA_TUNNEL_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, [TCA_TUNNEL_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) }, [TCA_TUNNEL_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, [TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16}, [TCA_TUNNEL_KEY_NO_CSUM] = { .type = NLA_U8 }, [TCA_TUNNEL_KEY_ENC_OPTS] = { .type = NLA_NESTED }, [TCA_TUNNEL_KEY_ENC_TOS] = { .type = NLA_U8 }, [TCA_TUNNEL_KEY_ENC_TTL] = { .type = NLA_U8 }, }; static void tunnel_key_release_params(struct tcf_tunnel_key_params *p) { if (!p) return; if (p->tcft_action == TCA_TUNNEL_KEY_ACT_SET) dst_release(&p->tcft_enc_metadata->dst); kfree_rcu(p, rcu); } static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_tunnel_key_ops.net_id); bool bind = act_flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; struct tcf_tunnel_key_params *params_new; IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *metadata = NULL; struct tcf_chain *goto_ch = NULL; struct tc_tunnel_key *parm; struct tcf_tunnel_key *t; bool exists = false; __be16 dst_port = 0; __be64 key_id = 0; int opts_len = 0; u8 tos, ttl; int ret = 0; u32 index; int err; if (!nla) { NL_SET_ERR_MSG(extack, "Tunnel requires attributes to be passed"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy, extack); if (err < 0) { NL_SET_ERR_MSG(extack, "Failed to parse nested tunnel key attributes"); return err; } if (!tb[TCA_TUNNEL_KEY_PARMS]) { NL_SET_ERR_MSG(extack, "Missing tunnel key parameters"); return -EINVAL; } parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return ACT_P_BOUND; switch (parm->t_action) { case TCA_TUNNEL_KEY_ACT_RELEASE: break; case TCA_TUNNEL_KEY_ACT_SET: if (tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) { __be32 key32; key32 = nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]); key_id = key32_to_tunnel_id(key32); __set_bit(IP_TUNNEL_KEY_BIT, flags); } __set_bit(IP_TUNNEL_CSUM_BIT, flags); if (tb[TCA_TUNNEL_KEY_NO_CSUM] && nla_get_u8(tb[TCA_TUNNEL_KEY_NO_CSUM])) __clear_bit(IP_TUNNEL_CSUM_BIT, flags); if (nla_get_flag(tb[TCA_TUNNEL_KEY_NO_FRAG])) __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, flags); if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT]) dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]); if (tb[TCA_TUNNEL_KEY_ENC_OPTS]) { opts_len = tunnel_key_get_opts_len(tb[TCA_TUNNEL_KEY_ENC_OPTS], extack); if (opts_len < 0) { ret = opts_len; goto err_out; } } tos = 0; if (tb[TCA_TUNNEL_KEY_ENC_TOS]) tos = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TOS]); ttl = 0; if (tb[TCA_TUNNEL_KEY_ENC_TTL]) ttl = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TTL]); if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] && tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) { __be32 saddr; __be32 daddr; saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]); daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]); metadata = __ip_tun_set_dst(saddr, daddr, tos, ttl, dst_port, flags, key_id, opts_len); } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] && tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) { struct in6_addr saddr; struct in6_addr daddr; saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]); daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]); metadata = __ipv6_tun_set_dst(&saddr, &daddr, tos, ttl, dst_port, 0, flags, key_id, opts_len); } else { NL_SET_ERR_MSG(extack, "Missing either ipv4 or ipv6 src and dst"); ret = -EINVAL; goto err_out; } if (!metadata) { NL_SET_ERR_MSG(extack, "Cannot allocate tunnel metadata dst"); ret = -ENOMEM; goto err_out; } #ifdef CONFIG_DST_CACHE ret = dst_cache_init(&metadata->u.tun_info.dst_cache, GFP_KERNEL); if (ret) goto release_tun_meta; #endif if (opts_len) { ret = tunnel_key_opts_set(tb[TCA_TUNNEL_KEY_ENC_OPTS], &metadata->u.tun_info, opts_len, extack); if (ret < 0) goto release_tun_meta; } metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX; break; default: NL_SET_ERR_MSG(extack, "Unknown tunnel key action"); ret = -EINVAL; goto err_out; } if (!exists) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_tunnel_key_ops, bind, act_flags); if (ret) { NL_SET_ERR_MSG(extack, "Cannot create TC IDR"); goto release_tun_meta; } ret = ACT_P_CREATED; } else if (!(act_flags & TCA_ACT_FLAGS_REPLACE)) { NL_SET_ERR_MSG(extack, "TC IDR already exists"); ret = -EEXIST; goto release_tun_meta; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) { ret = err; exists = true; goto release_tun_meta; } t = to_tunnel_key(*a); params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { NL_SET_ERR_MSG(extack, "Cannot allocate tunnel key parameters"); ret = -ENOMEM; exists = true; goto put_chain; } params_new->tcft_action = parm->t_action; params_new->tcft_enc_metadata = metadata; spin_lock_bh(&t->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params_new = rcu_replace_pointer(t->params, params_new, lockdep_is_held(&t->tcf_lock)); spin_unlock_bh(&t->tcf_lock); tunnel_key_release_params(params_new); if (goto_ch) tcf_chain_put_by_act(goto_ch); return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_tun_meta: if (metadata) dst_release(&metadata->dst); err_out: if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return ret; } static void tunnel_key_release(struct tc_action *a) { struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; params = rcu_dereference_protected(t->params, 1); tunnel_key_release_params(params); } static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { int len = info->options_len; u8 *src = (u8 *)(info + 1); struct nlattr *start; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE); if (!start) return -EMSGSIZE; while (len > 0) { struct geneve_opt *opt = (struct geneve_opt *)src; if (nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS, opt->opt_class) || nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE, opt->type) || nla_put(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA, opt->length * 4, opt + 1)) { nla_nest_cancel(skb, start); return -EMSGSIZE; } len -= sizeof(struct geneve_opt) + opt->length * 4; src += sizeof(struct geneve_opt) + opt->length * 4; } nla_nest_end(skb, start); return 0; } static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { struct vxlan_metadata *md = (struct vxlan_metadata *)(info + 1); struct nlattr *start; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_VXLAN); if (!start) return -EMSGSIZE; if (nla_put_u32(skb, TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, md->gbp)) { nla_nest_cancel(skb, start); return -EMSGSIZE; } nla_nest_end(skb, start); return 0; } static int tunnel_key_erspan_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { struct erspan_metadata *md = (struct erspan_metadata *)(info + 1); struct nlattr *start; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN); if (!start) return -EMSGSIZE; if (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER, md->version)) goto err; if (md->version == 1 && nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX, md->u.index)) goto err; if (md->version == 2 && (nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR, md->u.md2.dir) || nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID, get_hwid(&md->u.md2)))) goto err; nla_nest_end(skb, start); return 0; err: nla_nest_cancel(skb, start); return -EMSGSIZE; } static int tunnel_key_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { struct nlattr *start; int err = -EINVAL; if (!info->options_len) return 0; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS); if (!start) return -EMSGSIZE; if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) { err = tunnel_key_geneve_opts_dump(skb, info); if (err) goto err_out; } else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) { err = tunnel_key_vxlan_opts_dump(skb, info); if (err) goto err_out; } else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags)) { err = tunnel_key_erspan_opts_dump(skb, info); if (err) goto err_out; } else { err_out: nla_nest_cancel(skb, start); return err; } nla_nest_end(skb, start); return 0; } static int tunnel_key_dump_addresses(struct sk_buff *skb, const struct ip_tunnel_info *info) { unsigned short family = ip_tunnel_info_af(info); if (family == AF_INET) { __be32 saddr = info->key.u.ipv4.src; __be32 daddr = info->key.u.ipv4.dst; if (!nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_SRC, saddr) && !nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_DST, daddr)) return 0; } if (family == AF_INET6) { const struct in6_addr *saddr6 = &info->key.u.ipv6.src; const struct in6_addr *daddr6 = &info->key.u.ipv6.dst; if (!nla_put_in6_addr(skb, TCA_TUNNEL_KEY_ENC_IPV6_SRC, saddr6) && !nla_put_in6_addr(skb, TCA_TUNNEL_KEY_ENC_IPV6_DST, daddr6)) return 0; } return -EINVAL; } static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; struct tc_tunnel_key opt = { .index = t->tcf_index, .refcnt = refcount_read(&t->tcf_refcnt) - ref, .bindcnt = atomic_read(&t->tcf_bindcnt) - bind, }; struct tcf_t tm; spin_lock_bh(&t->tcf_lock); params = rcu_dereference_protected(t->params, lockdep_is_held(&t->tcf_lock)); opt.action = t->tcf_action; opt.t_action = params->tcft_action; if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET) { struct ip_tunnel_info *info = ¶ms->tcft_enc_metadata->u.tun_info; struct ip_tunnel_key *key = &info->key; __be32 key_id = tunnel_id_to_key32(key->tun_id); if ((test_bit(IP_TUNNEL_KEY_BIT, key->tun_flags) && nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id)) || tunnel_key_dump_addresses(skb, ¶ms->tcft_enc_metadata->u.tun_info) || (key->tp_dst && nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT, key->tp_dst)) || nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM, !test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags)) || (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags) && nla_put_flag(skb, TCA_TUNNEL_KEY_NO_FRAG)) || tunnel_key_opts_dump(skb, info)) goto nla_put_failure; if (key->tos && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TOS, key->tos)) goto nla_put_failure; if (key->ttl && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TTL, key->ttl)) goto nla_put_failure; } tcf_tm_dump(&tm, &t->tcf_tm); if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm), &tm, TCA_TUNNEL_KEY_PAD)) goto nla_put_failure; spin_unlock_bh(&t->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&t->tcf_lock); nlmsg_trim(skb, b); return -1; } static void tcf_tunnel_encap_put_tunnel(void *priv) { struct ip_tunnel_info *tunnel = priv; kfree(tunnel); } static int tcf_tunnel_encap_get_tunnel(struct flow_action_entry *entry, const struct tc_action *act) { entry->tunnel = tcf_tunnel_info_copy(act); if (!entry->tunnel) return -ENOMEM; entry->destructor = tcf_tunnel_encap_put_tunnel; entry->destructor_priv = entry->tunnel; return 0; } static int tcf_tunnel_key_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { int err; if (bind) { struct flow_action_entry *entry = entry_data; if (is_tcf_tunnel_set(act)) { entry->id = FLOW_ACTION_TUNNEL_ENCAP; err = tcf_tunnel_encap_get_tunnel(entry, act); if (err) return err; } else if (is_tcf_tunnel_release(act)) { entry->id = FLOW_ACTION_TUNNEL_DECAP; } else { NL_SET_ERR_MSG_MOD(extack, "Unsupported tunnel key mode offload"); return -EOPNOTSUPP; } *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; if (is_tcf_tunnel_set(act)) fl_action->id = FLOW_ACTION_TUNNEL_ENCAP; else if (is_tcf_tunnel_release(act)) fl_action->id = FLOW_ACTION_TUNNEL_DECAP; else return -EOPNOTSUPP; } return 0; } static struct tc_action_ops act_tunnel_key_ops = { .kind = "tunnel_key", .id = TCA_ID_TUNNEL_KEY, .owner = THIS_MODULE, .act = tunnel_key_act, .dump = tunnel_key_dump, .init = tunnel_key_init, .cleanup = tunnel_key_release, .offload_act_setup = tcf_tunnel_key_offload_act_setup, .size = sizeof(struct tcf_tunnel_key), }; MODULE_ALIAS_NET_ACT("tunnel_key"); static __net_init int tunnel_key_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_tunnel_key_ops.net_id); return tc_action_net_init(net, tn, &act_tunnel_key_ops); } static void __net_exit tunnel_key_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_tunnel_key_ops.net_id); } static struct pernet_operations tunnel_key_net_ops = { .init = tunnel_key_init_net, .exit_batch = tunnel_key_exit_net, .id = &act_tunnel_key_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init tunnel_key_init_module(void) { return tcf_register_action(&act_tunnel_key_ops, &tunnel_key_net_ops); } static void __exit tunnel_key_cleanup_module(void) { tcf_unregister_action(&act_tunnel_key_ops, &tunnel_key_net_ops); } module_init(tunnel_key_init_module); module_exit(tunnel_key_cleanup_module); MODULE_AUTHOR("Amir Vadai <amir@vadai.me>"); MODULE_DESCRIPTION("ip tunnel manipulation actions"); MODULE_LICENSE("GPL v2"); |
884 627 810 159 811 733 735 453 735 735 458 822 270 284 702 6 698 564 563 2 238 493 2 452 490 507 30 521 686 686 30 727 518 317 696 474 297 1200 716 520 777 490 288 575 390 186 431 307 125 237 123 114 384 233 151 246 150 96 82 47 35 61 24 37 421 179 418 421 420 420 288 135 421 421 418 287 135 241 36 218 219 295 86 258 47 225 19 5 15 15 41 97 59 42 42 686 8 687 134 2 133 39 3 37 46 2 44 38 7 38 7 40 40 4 130 11 9 121 121 1 11 32 24 8 32 367 368 15 96 107 105 15 15 11 12 12 12 4 7 4 7 11 11 11 11 11 11 11 826 825 519 214 124 306 521 474 474 299 213 25 477 474 6 477 29 26 1 2 31 621 588 586 587 52 2 1 1 50 28 26 2 1 569 547 565 566 30 15 14 25 33 30 31 10 123 13 530 96 25 1 20 1 1 51 26 22 4 23 1 24 21 33 42 33 24 1 36 1 3 2 1 25 28 5 3 8 47 28 17 3 1 331 38 328 235 515 235 235 69 29 42 68 222 82 3 533 15 455 91 92 91 74 17 442 83 6 7 519 3 520 60 462 34 489 504 2 45 3 531 2 2 20 521 18 556 556 1 556 1 2 554 530 2 490 59 1 60 56 4 549 539 538 540 537 539 539 538 537 538 539 539 537 538 22 26 1 26 1 24 24 27 27 27 27 23 24 23 24 27 27 2 21 6 24 3 27 27 24 3 21 6 20 8 27 8 12 27 27 24 3 27 1 26 27 27 27 27 388 40 52 5 10 286 286 301 297 4 299 326 63 135 135 134 155 20 134 135 135 135 311 312 58 162 91 444 3 445 448 433 447 447 447 440 6 436 9 2 442 438 5 434 8 1 438 437 3 435 3 3 434 434 45 186 245 430 387 275 102 71 430 408 22 410 397 498 107 7 1 437 4 7 38 48 8 44 278 23 10 10 277 292 383 14 86 292 292 120 277 291 120 278 298 35 35 35 35 35 35 35 35 35 333 335 333 334 335 311 4 18 381 190 189 381 234 464 57 395 126 399 123 455 67 521 397 52 73 434 87 124 396 104 416 521 465 1 124 1 336 1 2 3 30 427 433 433 430 29 1 2 1 138 336 1 10 3 1 8 4 502 2 522 2 522 2 521 1 14 4 518 345 498 477 15 7 472 2 23 2 473 20 7 466 22 8 1 478 477 1 1 477 457 1 454 1 454 453 448 1 450 448 1 448 271 176 1 447 447 447 1 447 21 21 19 501 66 2 1 24 39 63 435 531 3 1 1 1 1 349 177 1 176 4 467 1 1 251 41 70 8 93 476 475 137 2 336 1 2 4 467 373 20 512 532 532 3 525 522 521 515 12 504 3 23 9 2 8 8 16 380 53 281 5 146 329 6 328 21 94 1 3 3 3 2 393 26 126 241 394 300 93 335 9 4 341 40 68 302 11 15 354 2 368 213 155 2 273 93 353 353 150 202 41 40 2 170 172 172 352 531 522 10 532 353 353 179 533 34 10 2 22 5 7 5 1 1 7 14 6 2 12 12 7 5 573 574 52 538 575 574 575 330 190 356 421 177 383 575 575 575 575 571 1 73 5 70 13 23 14 23 60 32 44 7 7 7 7 55 7 6 49 51 4 54 1 55 2 1 52 19 1 1 15 16 2 15 18 1 14 2 2 10 41 1 12 33 35 3 1 7 37 5 21 23 41 30 10 15 15 15 62 7 55 15 41 38 3 35 38 38 38 2 38 122 121 5 122 191 190 8 140 68 123 122 1 4 4 4 1 1 1 1 2 29 131 8 123 142 134 132 131 105 32 34 9 25 32 7 1 247 1 246 2 57 238 181 12 30 179 177 49 4 4 4 2 1 192 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/super.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> #include <linux/time.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/parser.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/vfs.h> #include <linux/random.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/seq_file.h> #include <linux/ctype.h> #include <linux/log2.h> #include <linux/crc16.h> #include <linux/dax.h> #include <linux/uaccess.h> #include <linux/iversion.h> #include <linux/unicode.h> #include <linux/part_stat.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/fsnotify.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include "ext4.h" #include "ext4_extents.h" /* Needed for trace points definition */ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "mballoc.h" #include "fsmap.h" #define CREATE_TRACE_POINTS #include <trace/events/ext4.h> static struct ext4_lazy_init *ext4_li_info; static DEFINE_MUTEX(ext4_li_mtx); static struct ratelimit_state ext4_mount_msg_ratelimit; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); static int ext4_show_options(struct seq_file *seq, struct dentry *root); static void ext4_update_super(struct super_block *sb); static int ext4_commit_super(struct super_block *sb); static int ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); static int ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static inline int ext2_feature_set_ok(struct super_block *sb); static inline int ext3_feature_set_ok(struct super_block *sb); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); static struct inode *ext4_get_journal_inode(struct super_block *sb, unsigned int journal_inum); static int ext4_validate_options(struct fs_context *fc); static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb); static void ext4_apply_options(struct fs_context *fc, struct super_block *sb); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param); static int ext4_get_tree(struct fs_context *fc); static int ext4_reconfigure(struct fs_context *fc); static void ext4_fc_free(struct fs_context *fc); static int ext4_init_fs_context(struct fs_context *fc); static void ext4_kill_sb(struct super_block *sb); static const struct fs_parameter_spec ext4_param_specs[]; /* * Lock ordering * * page fault path: * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start * -> page lock -> i_data_sem (rw) * * buffered write path: * sb_start_write -> i_mutex -> mmap_lock * sb_start_write -> i_mutex -> transaction start -> page lock -> * i_data_sem (rw) * * truncate: * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) -> * page lock * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start -> * i_data_sem (rw) * * direct IO: * sb_start_write -> i_mutex -> mmap_lock * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw) * * writepages: * transaction start -> page lock(s) -> i_data_sem (rw) */ static const struct fs_context_operations ext4_context_ops = { .parse_param = ext4_parse_param, .get_tree = ext4_get_tree, .reconfigure = ext4_reconfigure, .free = ext4_fc_free, }; #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, .name = "ext2", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext2"); MODULE_ALIAS("ext2"); #define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type) #else #define IS_EXT2_SB(sb) (0) #endif static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, .name = "ext3", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); #define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type) static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail) { if (simu_fail) { clear_buffer_uptodate(bh); unlock_buffer(bh); return; } /* * buffer's verified bit is no longer valid after reading from * disk again due to write out error, clear it to make sure we * recheck the buffer contents. */ clear_buffer_verified(bh); bh->b_end_io = end_io ? end_io : end_buffer_read_sync; get_bh(bh); submit_bh(REQ_OP_READ | op_flags, bh); } void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail) { BUG_ON(!buffer_locked(bh)); if (ext4_buffer_uptodate(bh)) { unlock_buffer(bh); return; } __ext4_read_bh(bh, op_flags, end_io, simu_fail); } int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail) { BUG_ON(!buffer_locked(bh)); if (ext4_buffer_uptodate(bh)) { unlock_buffer(bh); return 0; } __ext4_read_bh(bh, op_flags, end_io, simu_fail); wait_on_buffer(bh); if (buffer_uptodate(bh)) return 0; return -EIO; } int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { lock_buffer(bh); if (!wait) { ext4_read_bh_nowait(bh, op_flags, NULL, false); return 0; } return ext4_read_bh(bh, op_flags, NULL, false); } /* * This works like __bread_gfp() except it uses ERR_PTR for error * returns. Currently with sb_bread it's impossible to distinguish * between ENOMEM and EIO situations (since both result in a NULL * return. */ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, sector_t block, blk_opf_t op_flags, gfp_t gfp) { struct buffer_head *bh; int ret; bh = sb_getblk_gfp(sb, block, gfp); if (bh == NULL) return ERR_PTR(-ENOMEM); if (ext4_buffer_uptodate(bh)) return bh; ret = ext4_read_bh_lock(bh, REQ_META | op_flags, true); if (ret) { put_bh(bh); return ERR_PTR(ret); } return bh; } struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags) { gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS) | __GFP_MOVABLE; return __ext4_sb_bread_gfp(sb, block, op_flags, gfp); } struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block) { gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, ~__GFP_FS); return __ext4_sb_bread_gfp(sb, block, 0, gfp); } void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN); if (likely(bh)) { if (trylock_buffer(bh)) ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL, false); brelse(bh); } } static int ext4_verify_csum_type(struct super_block *sb, struct ext4_super_block *es) { if (!ext4_has_feature_metadata_csum(sb)) return 1; return es->s_checksum_type == EXT4_CRC32C_CHKSUM; } __le32 ext4_superblock_csum(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct ext4_super_block, s_checksum); __u32 csum; csum = ext4_chksum(sbi, ~0, (char *)es, offset); return cpu_to_le32(csum); } static int ext4_superblock_csum_verify(struct super_block *sb, struct ext4_super_block *es) { if (!ext4_has_metadata_csum(sb)) return 1; return es->s_checksum == ext4_superblock_csum(sb, es); } void ext4_superblock_csum_set(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (!ext4_has_metadata_csum(sb)) return; es->s_checksum = ext4_superblock_csum(sb, es); } ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_block_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_inode_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg) { return le32_to_cpu(bg->bg_inode_table_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); } __u32 ext4_free_group_clusters(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_free_blocks_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); } __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_lo)) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_hi)) << 16 : 0); } __u32 ext4_used_dirs_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_used_dirs_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); } __u32 ext4_itable_unused_count(struct super_block *sb, struct ext4_group_desc *bg) { return le16_to_cpu(bg->bg_itable_unused_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); } void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32); } void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_inode_bitmap_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32); } void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { bg->bg_inode_table_lo = cpu_to_le32((u32)blk); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); } void ext4_free_group_clusters_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); } void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { WRITE_ONCE(bg->bg_free_inodes_count_lo, cpu_to_le16((__u16)count)); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) WRITE_ONCE(bg->bg_free_inodes_count_hi, cpu_to_le16(count >> 16)); } void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); } void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); } static void __ext4_update_tstamp(__le32 *lo, __u8 *hi, time64_t now) { now = clamp_val(now, 0, (1ull << 40) - 1); *lo = cpu_to_le32(lower_32_bits(now)); *hi = upper_32_bits(now); } static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) { return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo); } #define ext4_update_tstamp(es, tstamp) \ __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi, \ ktime_get_real_seconds()) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) #define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */ #define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */ /* * The ext4_maybe_update_superblock() function checks and updates the * superblock if needed. * * This function is designed to update the on-disk superblock only under * certain conditions to prevent excessive disk writes and unnecessary * waking of the disk from sleep. The superblock will be updated if: * 1. More than an hour has passed since the last superblock update, and * 2. More than 16MB have been written since the last superblock update. * * @sb: The superblock */ static void ext4_maybe_update_superblock(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; journal_t *journal = sbi->s_journal; time64_t now; __u64 last_update; __u64 lifetime_write_kbytes; __u64 diff_size; if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || !journal || (journal->j_flags & JBD2_UNMOUNT)) return; now = ktime_get_real_seconds(); last_update = ext4_get_tstamp(es, s_wtime); if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC)) return; lifetime_write_kbytes = sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1); /* Get the number of kilobytes not written to disk to account * for statistics and compare with a multiple of 16 MB. This * is used to determine when the next superblock commit should * occur (i.e. not more often than once per 16MB if there was * less written in an hour). */ diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int error = is_journal_aborted(journal); struct ext4_journal_cb_entry *jce; BUG_ON(txn->t_state == T_FINISHED); ext4_process_freed_data(sb, txn->t_tid); ext4_maybe_update_superblock(sb); spin_lock(&sbi->s_md_lock); while (!list_empty(&txn->t_private_list)) { jce = list_entry(txn->t_private_list.next, struct ext4_journal_cb_entry, jce_list); list_del_init(&jce->jce_list); spin_unlock(&sbi->s_md_lock); jce->jce_func(sb, jce, error); spin_lock(&sbi->s_md_lock); } spin_unlock(&sbi->s_md_lock); } /* * This writepage callback for write_cache_pages() * takes care of a few cases after page cleaning. * * write_cache_pages() already checks for dirty pages * and calls clear_page_dirty_for_io(), which we want, * to write protect the pages. * * However, we may have to redirty a page (see below.) */ static int ext4_journalled_writepage_callback(struct folio *folio, struct writeback_control *wbc, void *data) { transaction_t *transaction = (transaction_t *) data; struct buffer_head *bh, *head; struct journal_head *jh; bh = head = folio_buffers(folio); do { /* * We have to redirty a page in these cases: * 1) If buffer is dirty, it means the page was dirty because it * contains a buffer that needs checkpointing. So the dirty bit * needs to be preserved so that checkpointing writes the buffer * properly. * 2) If buffer is not part of the committing transaction * (we may have just accidentally come across this buffer because * inode range tracking is not exact) or if the currently running * transaction already contains this buffer as well, dirty bit * needs to be preserved so that the buffer gets writeprotected * properly on running transaction's commit. */ jh = bh2jh(bh); if (buffer_dirty(bh) || (jh && (jh->b_transaction != transaction || jh->b_next_transaction))) { folio_redirty_for_writepage(wbc, folio); goto out; } } while ((bh = bh->b_this_page) != head); out: return AOP_WRITEPAGE_ACTIVATE; } static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) { struct address_space *mapping = jinode->i_vfs_inode->i_mapping; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; return write_cache_pages(mapping, &wbc, ext4_journalled_writepage_callback, jinode->i_transaction); } static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) { int ret; if (ext4_should_journal_data(jinode->i_vfs_inode)) ret = ext4_journalled_submit_inode_data_buffers(jinode); else ret = ext4_normal_submit_inode_data_buffers(jinode); return ret; } static int ext4_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) { int ret = 0; if (!ext4_should_journal_data(jinode->i_vfs_inode)) ret = jbd2_journal_finish_inode_data_buffers(jinode); return ret; } static bool system_going_down(void) { return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF || system_state == SYSTEM_RESTART; } struct ext4_err_translation { int code; int errno; }; #define EXT4_ERR_TRANSLATE(err) { .code = EXT4_ERR_##err, .errno = err } static struct ext4_err_translation err_translation[] = { EXT4_ERR_TRANSLATE(EIO), EXT4_ERR_TRANSLATE(ENOMEM), EXT4_ERR_TRANSLATE(EFSBADCRC), EXT4_ERR_TRANSLATE(EFSCORRUPTED), EXT4_ERR_TRANSLATE(ENOSPC), EXT4_ERR_TRANSLATE(ENOKEY), EXT4_ERR_TRANSLATE(EROFS), EXT4_ERR_TRANSLATE(EFBIG), EXT4_ERR_TRANSLATE(EEXIST), EXT4_ERR_TRANSLATE(ERANGE), EXT4_ERR_TRANSLATE(EOVERFLOW), EXT4_ERR_TRANSLATE(EBUSY), EXT4_ERR_TRANSLATE(ENOTDIR), EXT4_ERR_TRANSLATE(ENOTEMPTY), EXT4_ERR_TRANSLATE(ESHUTDOWN), EXT4_ERR_TRANSLATE(EFAULT), }; static int ext4_errno_to_code(int errno) { int i; for (i = 0; i < ARRAY_SIZE(err_translation); i++) if (err_translation[i].errno == errno) return err_translation[i].code; return EXT4_ERR_UNKNOWN; } static void save_error_info(struct super_block *sb, int error, __u32 ino, __u64 block, const char *func, unsigned int line) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* We default to EFSCORRUPTED error... */ if (error == 0) error = EFSCORRUPTED; spin_lock(&sbi->s_error_lock); sbi->s_add_error_count++; sbi->s_last_error_code = error; sbi->s_last_error_line = line; sbi->s_last_error_ino = ino; sbi->s_last_error_block = block; sbi->s_last_error_func = func; sbi->s_last_error_time = ktime_get_real_seconds(); if (!sbi->s_first_error_time) { sbi->s_first_error_code = error; sbi->s_first_error_line = line; sbi->s_first_error_ino = ino; sbi->s_first_error_block = block; sbi->s_first_error_func = func; sbi->s_first_error_time = sbi->s_last_error_time; } spin_unlock(&sbi->s_error_lock); } /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * * On ext2, we can store the error state of the filesystem in the * superblock. That is not possible on ext4, because we may have other * write ordering constraints on the superblock which prevent us from * writing it out straight away; and given that the journal is about to * be aborted, we can't rely on the current, or future, transactions to * write out the superblock safely. * * We'll just use the jbd2_journal_abort() error code to record an error in * the journal instead. On recovery, the journal will complain about * that error until we've noted it down and cleared it. * * If force_ro is set, we unconditionally force the filesystem into an * ABORT|READONLY state, unless the error response on the fs has been set to * panic in which case we take the easy way out and panic immediately. This is * used to deal with unrecoverable failures such as journal IO errors or ENOMEM * at a critical moment in log management. */ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, __u32 ino, __u64 block, const char *func, unsigned int line) { journal_t *journal = EXT4_SB(sb)->s_journal; bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); if (!continue_fs && !sb_rdonly(sb)) { set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); if (journal) jbd2_journal_abort(journal, -EIO); } if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, error, ino, block, func, line); /* * In case the fs should keep running, we need to writeout * superblock through the journal. Due to lock ordering * constraints, it may not be safe to do it right here so we * defer superblock flushing to a workqueue. */ if (continue_fs && journal) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); else ext4_commit_super(sb); } /* * We force ERRORS_RO behavior when system is rebooting. Otherwise we * could panic during 'reboot -f' as the underlying device got already * disabled. */ if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } if (sb_rdonly(sb) || continue_fs) return; ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * EXT4_FLAGS_SHUTDOWN was set which stops all filesystem * modifications. We don't set SB_RDONLY because that requires * sb->s_umount semaphore and setting it without proper remount * procedure is confusing code such as freeze_super() leading to * deadlocks and other problems. */ } static void update_super_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, s_sb_upd_work); journal_t *journal = sbi->s_journal; handle_t *handle; /* * If the journal is still running, we have to write out superblock * through the journal to avoid collisions of other journalled sb * updates. * * We use directly jbd2 functions here to avoid recursing back into * ext4 error handling code during handling of previous errors. */ if (!sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; bool call_notify_err = false; handle = jbd2_journal_start(journal, 1); if (IS_ERR(handle)) goto write_directly; if (jbd2_journal_get_write_access(handle, sbh)) { jbd2_journal_stop(handle); goto write_directly; } if (sbi->s_add_error_count > 0) call_notify_err = true; ext4_update_super(sbi->s_sb); if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " "superblock detected"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } if (jbd2_journal_dirty_metadata(handle, sbh)) { jbd2_journal_stop(handle); goto write_directly; } jbd2_journal_stop(handle); if (call_notify_err) ext4_notify_error_sysfs(sbi); return; } write_directly: /* * Write through journal failed. Write sb directly to get error info * out and hope for the best. */ ext4_commit_super(sbi->s_sb); ext4_notify_error_sysfs(sbi); } #define ext4_error_ratelimit(sb) \ ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \ "EXT4-fs error") void __ext4_error(struct super_block *sb, const char *function, unsigned int line, bool force_ro, int error, __u64 block, const char *fmt, ...) { struct va_format vaf; va_list args; if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n", sb->s_id, function, line, current->comm, &vaf); va_end(args); } fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED); ext4_handle_error(sb, force_ro, error, 0, block, function, line); } void __ext4_error_inode(struct inode *inode, const char *function, unsigned int line, ext4_fsblk_t block, int error, const char *fmt, ...) { va_list args; struct va_format vaf; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); if (ext4_error_ratelimit(inode->i_sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (block) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " "inode #%lu: block %llu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, &vaf); else printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); } fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED); ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block, function, line); } void __ext4_error_file(struct file *file, const char *function, unsigned int line, ext4_fsblk_t block, const char *fmt, ...) { va_list args; struct va_format vaf; struct inode *inode = file_inode(file); char pathname[80], *path; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); if (ext4_error_ratelimit(inode->i_sb)) { path = file_path(file, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (block) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: " "block %llu: comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, path, &vaf); else printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: " "comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, path, &vaf); va_end(args); } fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED); ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block, function, line); } const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]) { char *errstr = NULL; switch (errno) { case -EFSCORRUPTED: errstr = "Corrupt filesystem"; break; case -EFSBADCRC: errstr = "Filesystem failed CRC"; break; case -EIO: errstr = "IO failure"; break; case -ENOMEM: errstr = "Out of memory"; break; case -EROFS: if (!sb || (EXT4_SB(sb)->s_journal && EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) errstr = "Journal has aborted"; else errstr = "Readonly filesystem"; break; default: /* If the caller passed in an extra buffer for unknown * errors, textualise them now. Else we just return * NULL. */ if (nbuf) { /* Check for truncated error codes... */ if (snprintf(nbuf, 16, "error %d", -errno) >= 0) errstr = nbuf; } break; } return errstr; } /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. */ void __ext4_std_error(struct super_block *sb, const char *function, unsigned int line, int errno) { char nbuf[16]; const char *errstr; if (unlikely(ext4_forced_shutdown(sb))) return; /* Special case: if the error is EROFS, and we're not already * inside a transaction, then there's really no point in logging * an error. */ if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb)) return; if (ext4_error_ratelimit(sb)) { errstr = ext4_decode_error(sb, errno, nbuf); printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", sb->s_id, function, line, errstr); } fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED); ext4_handle_error(sb, false, -errno, 0, 0, function, line); } void __ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) { struct va_format vaf; va_list args; if (sb) { atomic_inc(&EXT4_SB(sb)->s_msg_count); if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs")) return; } va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (sb) printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf); else printk("%sEXT4-fs: %pV\n", prefix, &vaf); va_end(args); } static int ext4_warning_ratelimit(struct super_block *sb) { atomic_inc(&EXT4_SB(sb)->s_warning_count); return ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), "EXT4-fs warning"); } void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; if (!ext4_warning_ratelimit(sb)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n", sb->s_id, function, line, &vaf); va_end(args); } void __ext4_warning_inode(const struct inode *inode, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; if (!ext4_warning_ratelimit(inode->i_sb)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: " "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); } void __ext4_grp_locked_error(const char *function, unsigned int line, struct super_block *sb, ext4_group_t grp, unsigned long ino, ext4_fsblk_t block, const char *fmt, ...) __releases(bitlock) __acquires(bitlock) { struct va_format vaf; va_list args; if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", sb->s_id, function, line, grp); if (ino) printk(KERN_CONT "inode %lu: ", ino); if (block) printk(KERN_CONT "block %llu:", (unsigned long long) block); printk(KERN_CONT "%pV\n", &vaf); va_end(args); } if (test_opt(sb, ERRORS_CONT)) { if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, EFSCORRUPTED, ino, block, function, line); schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } return; } ext4_unlock_group(sb, grp); ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line); /* * We only get here in the ERRORS_RO case; relocking the group * may be dangerous, but nothing bad will happen since the * filesystem will have already been marked read/only and the * journal has been aborted. We return 1 as a hint to callers * who might what to use the return value from * ext4_grp_locked_error() to distinguish between the * ERRORS_CONT and ERRORS_RO case, and perhaps return more * aggressively from the ext4 function in question, with a * more appropriate error code. */ ext4_lock_group(sb, grp); return; } void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t group, unsigned int flags) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); int ret; if (!grp || !gdp) return; if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) { ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); if (!ret) percpu_counter_sub(&sbi->s_freeclusters_counter, grp->bb_free); } if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) { ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); if (!ret && gdp) { int count; count = ext4_free_inodes_count(sb, gdp); percpu_counter_sub(&sbi->s_freeinodes_counter, count); } } } void ext4_update_dynamic_rev(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) return; ext4_warning(sb, "updating to rev %d because of new feature flag, " "running e2fsck is recommended", EXT4_DYNAMIC_REV); es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO); es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE); es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV); /* leave es->s_feature_*compat flags alone */ /* es->s_uuid will be set by e2fsck if empty */ /* * The rest of the superblock fields should be zero, and if not it * means they are likely already in use, so leave them alone. We * can leave it up to e2fsck to clean up any inconsistencies there. */ } static inline struct inode *orphan_list_entry(struct list_head *l) { return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; } static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) { struct list_head *l; ext4_msg(sb, KERN_ERR, "sb orphan head is %d", le32_to_cpu(sbi->s_es->s_last_orphan)); printk(KERN_ERR "sb_info orphan list:\n"); list_for_each(l, &sbi->s_orphan) { struct inode *inode = orphan_list_entry(l); printk(KERN_ERR " " "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", inode->i_sb->s_id, inode->i_ino, inode, inode->i_mode, inode->i_nlink, NEXT_ORPHAN(inode)); } } #ifdef CONFIG_QUOTA static int ext4_quota_off(struct super_block *sb, int type); static inline void ext4_quotas_off(struct super_block *sb, int type) { BUG_ON(type > EXT4_MAXQUOTAS); /* Use our quota_off function to clear inode flags etc. */ for (type--; type >= 0; type--) ext4_quota_off(sb, type); } /* * This is a helper function which is used in the mount/remount * codepaths (which holds s_umount) to fetch the quota file name. */ static inline char *get_qf_name(struct super_block *sb, struct ext4_sb_info *sbi, int type) { return rcu_dereference_protected(sbi->s_qf_names[type], lockdep_is_held(&sb->s_umount)); } #else static inline void ext4_quotas_off(struct super_block *sb, int type) { } #endif static int ext4_percpu_param_init(struct ext4_sb_info *sbi) { ext4_fsblk_t block; int err; block = ext4_count_free_clusters(sbi->s_sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); err = percpu_counter_init(&sbi->s_freeclusters_counter, block, GFP_KERNEL); if (!err) { unsigned long freei = ext4_count_free_inodes(sbi->s_sb); sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sbi->s_sb), GFP_KERNEL); if (!err) err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, GFP_KERNEL); if (!err) err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, GFP_KERNEL); if (!err) err = percpu_init_rwsem(&sbi->s_writepages_rwsem); if (err) ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory"); return err; } static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi) { percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); percpu_free_rwsem(&sbi->s_writepages_rwsem); } static void ext4_group_desc_free(struct ext4_sb_info *sbi) { struct buffer_head **group_desc; int i; rcu_read_lock(); group_desc = rcu_dereference(sbi->s_group_desc); for (i = 0; i < sbi->s_gdb_count; i++) brelse(group_desc[i]); kvfree(group_desc); rcu_read_unlock(); } static void ext4_flex_groups_free(struct ext4_sb_info *sbi) { struct flex_groups **flex_groups; int i; rcu_read_lock(); flex_groups = rcu_dereference(sbi->s_flex_groups); if (flex_groups) { for (i = 0; i < sbi->s_flex_groups_allocated; i++) kvfree(flex_groups[i]); kvfree(flex_groups); } rcu_read_unlock(); } static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int aborted = 0; int err; /* * Unregister sysfs before destroying jbd2 journal. * Since we could still access attr_journal_task attribute via sysfs * path which could have sbi->s_journal->j_task as NULL * Unregister sysfs before flush sbi->s_sb_upd_work. * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If * read metadata verify failed then will queue error work. * update_super_work will call start_this_handle may trigger * BUG_ON. */ ext4_unregister_sysfs(sb); if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount")) ext4_msg(sb, KERN_INFO, "unmounting filesystem %pU.", &sb->s_uuid); ext4_unregister_li_request(sb); ext4_quotas_off(sb, EXT4_MAXQUOTAS); flush_work(&sbi->s_sb_upd_work); destroy_workqueue(sbi->rsv_conversion_wq); ext4_release_orphan_info(sb); if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; if ((err < 0) && !aborted) { ext4_abort(sb, -err, "Couldn't clean up the journal"); } } ext4_es_unregister_shrinker(sbi); timer_shutdown_sync(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); if (!sb_rdonly(sb) && !aborted) { ext4_clear_feature_journal_needs_recovery(sb); ext4_clear_feature_orphan_present(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } if (!sb_rdonly(sb)) ext4_commit_super(sb); ext4_group_desc_free(sbi); ext4_flex_groups_free(sbi); WARN_ON_ONCE(!(sbi->s_mount_state & EXT4_ERROR_FS) && percpu_counter_sum(&sbi->s_dirtyclusters_counter)); ext4_percpu_param_destroy(sbi); #ifdef CONFIG_QUOTA for (int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif /* Debugging code just in case the in-memory inode orphan list * isn't empty. The on-disk one can be non-empty if we've * detected an error and taken the fs readonly, but the * in-memory list had better be clean by this point. */ if (!list_empty(&sbi->s_orphan)) dump_orphan_list(sb, sbi); ASSERT(list_empty(&sbi->s_orphan)); sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); if (sbi->s_journal_bdev_file) { /* * Invalidate the journal device's buffers. We don't want them * floating about in memory - the physical journal device may * hotswapped, and it breaks the `ro-after' testing code. */ sync_blockdev(file_bdev(sbi->s_journal_bdev_file)); invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); } ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); sbi->s_ea_inode_cache = NULL; ext4_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; ext4_stop_mmpd(sbi); brelse(sbi->s_sbh); sb->s_fs_info = NULL; /* * Now that we are completely done shutting down the * superblock, we need to actually destroy the kobject. */ kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev, NULL); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif kfree(sbi); } static struct kmem_cache *ext4_inode_cachep; /* * Called inside transaction, so use GFP_NOFS */ static struct inode *ext4_alloc_inode(struct super_block *sb) { struct ext4_inode_info *ei; ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS); if (!ei) return NULL; inode_set_iversion(&ei->vfs_inode, 1); ei->i_flags = 0; spin_lock_init(&ei->i_raw_lock); ei->i_prealloc_node = RB_ROOT; atomic_set(&ei->i_prealloc_active, 0); rwlock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); INIT_LIST_HEAD(&ei->i_es_list); ei->i_es_all_nr = 0; ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; spin_lock_init(&(ei->i_block_reservation_lock)); ext4_init_pending_tree(&ei->i_pending_tree); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); #endif ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_rsv_conversion_list); spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); ext4_fc_init_inode(&ei->vfs_inode); mutex_init(&ei->i_fc_lock); return &ei->vfs_inode; } static int ext4_drop_inode(struct inode *inode) { int drop = generic_drop_inode(inode); if (!drop) drop = fscrypt_drop_inode(inode); trace_ext4_drop_inode(inode, drop); return drop; } static void ext4_free_in_core_inode(struct inode *inode) { fscrypt_free_inode(inode); if (!list_empty(&(EXT4_I(inode)->i_fc_list))) { pr_warn("%s: inode %ld still in fc list", __func__, inode->i_ino); } kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); } static void ext4_destroy_inode(struct inode *inode) { if (!list_empty(&(EXT4_I(inode)->i_orphan))) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): orphan list check failed!", inode->i_ino, EXT4_I(inode)); print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, EXT4_I(inode), sizeof(struct ext4_inode_info), true); dump_stack(); } if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) && WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks)) ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!", inode->i_ino, EXT4_I(inode), EXT4_I(inode)->i_reserved_data_blocks); } static void ext4_shutdown(struct super_block *sb) { ext4_force_shutdown(sb, EXT4_GOING_FLAGS_NOLOGFLUSH); } static void init_once(void *foo) { struct ext4_inode_info *ei = foo; INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); } static int __init init_inodecache(void) { ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache", sizeof(struct ext4_inode_info), 0, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, offsetof(struct ext4_inode_info, i_data), sizeof_field(struct ext4_inode_info, i_data), init_once); if (ext4_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(ext4_inode_cachep); } void ext4_clear_inode(struct inode *inode) { ext4_fc_del(inode); invalidate_inode_buffers(inode); clear_inode(inode); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); dquot_drop(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); jbd2_free_inode(EXT4_I(inode)->jinode); EXT4_I(inode)->jinode = NULL; } fscrypt_put_encryption_info(inode); fsverity_cleanup_inode(inode); } static struct inode *ext4_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct inode *inode; /* * Currently we don't know the generation for parent directory, so * a generation of 0 means "accept any" */ inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { iput(inode); return ERR_PTR(-ESTALE); } return inode; } static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static int ext4_nfs_commit_metadata(struct inode *inode) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL }; trace_ext4_nfs_commit_metadata(inode); return ext4_write_inode(inode, &wbc); } #ifdef CONFIG_QUOTA static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) static int ext4_write_dquot(struct dquot *dquot); static int ext4_acquire_dquot(struct dquot *dquot); static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); static struct dquot __rcu **ext4_get_dquots(struct inode *inode) { return EXT4_I(inode)->i_dquot; } static const struct dquot_operations ext4_quota_operations = { .get_reserved_space = ext4_get_reserved_space, .write_dquot = ext4_write_dquot, .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, .mark_dirty = ext4_mark_dquot_dirty, .write_info = ext4_write_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, .get_projid = ext4_get_projid, .get_inode_usage = ext4_get_inode_usage, .get_next_id = dquot_get_next_id, }; static const struct quotactl_ops ext4_qctl_operations = { .quota_on = ext4_quota_on, .quota_off = ext4_quota_off, .quota_sync = dquot_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk, .get_nextdqblk = dquot_get_next_dqblk, }; #endif static const struct super_operations ext4_sops = { .alloc_inode = ext4_alloc_inode, .free_inode = ext4_free_in_core_inode, .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .put_super = ext4_put_super, .sync_fs = ext4_sync_fs, .freeze_fs = ext4_freeze, .unfreeze_fs = ext4_unfreeze, .statfs = ext4_statfs, .show_options = ext4_show_options, .shutdown = ext4_shutdown, #ifdef CONFIG_QUOTA .quota_read = ext4_quota_read, .quota_write = ext4_quota_write, .get_dquots = ext4_get_dquots, #endif }; static const struct export_operations ext4_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, .get_parent = ext4_get_parent, .commit_metadata = ext4_nfs_commit_metadata, }; enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_nouid32, Opt_debug, Opt_removed, Opt_user_xattr, Opt_acl, Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, Opt_inlinecrypt, Opt_usrjquota, Opt_grpjquota, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, Opt_nowarn_on_error, Opt_mblk_io_submit, Opt_debug_want_extra_isize, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type, #ifdef CONFIG_EXT4_DEBUG Opt_fc_debug_max_replay, Opt_fc_debug_force #endif }; static const struct constant_table ext4_param_errors[] = { {"continue", EXT4_MOUNT_ERRORS_CONT}, {"panic", EXT4_MOUNT_ERRORS_PANIC}, {"remount-ro", EXT4_MOUNT_ERRORS_RO}, {} }; static const struct constant_table ext4_param_data[] = { {"journal", EXT4_MOUNT_JOURNAL_DATA}, {"ordered", EXT4_MOUNT_ORDERED_DATA}, {"writeback", EXT4_MOUNT_WRITEBACK_DATA}, {} }; static const struct constant_table ext4_param_data_err[] = { {"abort", Opt_data_err_abort}, {"ignore", Opt_data_err_ignore}, {} }; static const struct constant_table ext4_param_jqfmt[] = { {"vfsold", QFMT_VFS_OLD}, {"vfsv0", QFMT_VFS_V0}, {"vfsv1", QFMT_VFS_V1}, {} }; static const struct constant_table ext4_param_dax[] = { {"always", Opt_dax_always}, {"inode", Opt_dax_inode}, {"never", Opt_dax_never}, {} }; /* * Mount option specification * We don't use fsparam_flag_no because of the way we set the * options and the way we show them in _ext4_show_options(). To * keep the changes to a minimum, let's keep the negative options * separate for now. */ static const struct fs_parameter_spec ext4_param_specs[] = { fsparam_flag ("bsddf", Opt_bsd_df), fsparam_flag ("minixdf", Opt_minix_df), fsparam_flag ("grpid", Opt_grpid), fsparam_flag ("bsdgroups", Opt_grpid), fsparam_flag ("nogrpid", Opt_nogrpid), fsparam_flag ("sysvgroups", Opt_nogrpid), fsparam_gid ("resgid", Opt_resgid), fsparam_uid ("resuid", Opt_resuid), fsparam_u32 ("sb", Opt_sb), fsparam_enum ("errors", Opt_errors, ext4_param_errors), fsparam_flag ("nouid32", Opt_nouid32), fsparam_flag ("debug", Opt_debug), fsparam_flag ("oldalloc", Opt_removed), fsparam_flag ("orlov", Opt_removed), fsparam_flag ("user_xattr", Opt_user_xattr), fsparam_flag ("acl", Opt_acl), fsparam_flag ("norecovery", Opt_noload), fsparam_flag ("noload", Opt_noload), fsparam_flag ("bh", Opt_removed), fsparam_flag ("nobh", Opt_removed), fsparam_u32 ("commit", Opt_commit), fsparam_u32 ("min_batch_time", Opt_min_batch_time), fsparam_u32 ("max_batch_time", Opt_max_batch_time), fsparam_u32 ("journal_dev", Opt_journal_dev), fsparam_bdev ("journal_path", Opt_journal_path), fsparam_flag ("journal_checksum", Opt_journal_checksum), fsparam_flag ("nojournal_checksum", Opt_nojournal_checksum), fsparam_flag ("journal_async_commit",Opt_journal_async_commit), fsparam_flag ("abort", Opt_abort), fsparam_enum ("data", Opt_data, ext4_param_data), fsparam_enum ("data_err", Opt_data_err, ext4_param_data_err), fsparam_string_empty ("usrjquota", Opt_usrjquota), fsparam_string_empty ("grpjquota", Opt_grpjquota), fsparam_enum ("jqfmt", Opt_jqfmt, ext4_param_jqfmt), fsparam_flag ("grpquota", Opt_grpquota), fsparam_flag ("quota", Opt_quota), fsparam_flag ("noquota", Opt_noquota), fsparam_flag ("usrquota", Opt_usrquota), fsparam_flag ("prjquota", Opt_prjquota), fsparam_flag ("barrier", Opt_barrier), fsparam_u32 ("barrier", Opt_barrier), fsparam_flag ("nobarrier", Opt_nobarrier), fsparam_flag ("i_version", Opt_removed), fsparam_flag ("dax", Opt_dax), fsparam_enum ("dax", Opt_dax_type, ext4_param_dax), fsparam_u32 ("stripe", Opt_stripe), fsparam_flag ("delalloc", Opt_delalloc), fsparam_flag ("nodelalloc", Opt_nodelalloc), fsparam_flag ("warn_on_error", Opt_warn_on_error), fsparam_flag ("nowarn_on_error", Opt_nowarn_on_error), fsparam_u32 ("debug_want_extra_isize", Opt_debug_want_extra_isize), fsparam_flag ("mblk_io_submit", Opt_removed), fsparam_flag ("nomblk_io_submit", Opt_removed), fsparam_flag ("block_validity", Opt_block_validity), fsparam_flag ("noblock_validity", Opt_noblock_validity), fsparam_u32 ("inode_readahead_blks", Opt_inode_readahead_blks), fsparam_u32 ("journal_ioprio", Opt_journal_ioprio), fsparam_u32 ("auto_da_alloc", Opt_auto_da_alloc), fsparam_flag ("auto_da_alloc", Opt_auto_da_alloc), fsparam_flag ("noauto_da_alloc", Opt_noauto_da_alloc), fsparam_flag ("dioread_nolock", Opt_dioread_nolock), fsparam_flag ("nodioread_nolock", Opt_dioread_lock), fsparam_flag ("dioread_lock", Opt_dioread_lock), fsparam_flag ("discard", Opt_discard), fsparam_flag ("nodiscard", Opt_nodiscard), fsparam_u32 ("init_itable", Opt_init_itable), fsparam_flag ("init_itable", Opt_init_itable), fsparam_flag ("noinit_itable", Opt_noinit_itable), #ifdef CONFIG_EXT4_DEBUG fsparam_flag ("fc_debug_force", Opt_fc_debug_force), fsparam_u32 ("fc_debug_max_replay", Opt_fc_debug_max_replay), #endif fsparam_u32 ("max_dir_size_kb", Opt_max_dir_size_kb), fsparam_flag ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_string ("test_dummy_encryption", Opt_test_dummy_encryption), fsparam_flag ("inlinecrypt", Opt_inlinecrypt), fsparam_flag ("nombcache", Opt_nombcache), fsparam_flag ("no_mbcache", Opt_nombcache), /* for backward compatibility */ fsparam_flag ("prefetch_block_bitmaps", Opt_removed), fsparam_flag ("no_prefetch_block_bitmaps", Opt_no_prefetch_block_bitmaps), fsparam_s32 ("mb_optimize_scan", Opt_mb_optimize_scan), fsparam_string ("check", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("nocheck", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("reservation", Opt_removed), /* mount option from ext2/3 */ fsparam_flag ("noreservation", Opt_removed), /* mount option from ext2/3 */ fsparam_u32 ("journal", Opt_removed), /* mount option from ext2/3 */ {} }; #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 #define MOPT_NOSUPPORT 0x0004 #define MOPT_EXPLICIT 0x0008 #ifdef CONFIG_QUOTA #define MOPT_Q 0 #define MOPT_QFMT 0x0010 #else #define MOPT_Q MOPT_NOSUPPORT #define MOPT_QFMT MOPT_NOSUPPORT #endif #define MOPT_NO_EXT2 0x0020 #define MOPT_NO_EXT3 0x0040 #define MOPT_EXT4_ONLY (MOPT_NO_EXT2 | MOPT_NO_EXT3) #define MOPT_SKIP 0x0080 #define MOPT_2 0x0100 static const struct mount_opts { int token; int mount_opt; int flags; } ext4_mount_opts[] = { {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET}, {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR}, {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET}, {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR}, {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET}, {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR}, {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_EXT4_ONLY | MOPT_SET}, {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET}, {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR}, {Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET}, {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR}, {Opt_commit, 0, MOPT_NO_EXT2}, {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, {Opt_data_err, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_NO_EXT2}, {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR}, {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR}, {Opt_dax_type, 0, MOPT_EXT4_ONLY}, {Opt_journal_dev, 0, MOPT_NO_EXT2}, {Opt_journal_path, 0, MOPT_NO_EXT2}, {Opt_journal_ioprio, 0, MOPT_NO_EXT2}, {Opt_data, 0, MOPT_NO_EXT2}, {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET}, #ifdef CONFIG_EXT4_FS_POSIX_ACL {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET}, #else {Opt_acl, 0, MOPT_NOSUPPORT}, #endif {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET}, {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET}, {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q}, {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA, MOPT_SET | MOPT_Q}, {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA, MOPT_SET | MOPT_Q}, {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA), MOPT_CLEAR | MOPT_Q}, {Opt_usrjquota, 0, MOPT_Q}, {Opt_grpjquota, 0, MOPT_Q}, {Opt_jqfmt, 0, MOPT_QFMT}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, #ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, #endif {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2}, {Opt_err, 0, 0} }; #if IS_ENABLED(CONFIG_UNICODE) static const struct ext4_sb_encodings { __u16 magic; char *name; unsigned int version; } ext4_sb_encoding_map[] = { {EXT4_ENC_UTF8_12_1, "utf8", UNICODE_AGE(12, 1, 0)}, }; static const struct ext4_sb_encodings * ext4_sb_read_encoding(const struct ext4_super_block *es) { __u16 magic = le16_to_cpu(es->s_encoding); int i; for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++) if (magic == ext4_sb_encoding_map[i].magic) return &ext4_sb_encoding_map[i]; return NULL; } #endif #define EXT4_SPEC_JQUOTA (1 << 0) #define EXT4_SPEC_JQFMT (1 << 1) #define EXT4_SPEC_DATAJ (1 << 2) #define EXT4_SPEC_SB_BLOCK (1 << 3) #define EXT4_SPEC_JOURNAL_DEV (1 << 4) #define EXT4_SPEC_JOURNAL_IOPRIO (1 << 5) #define EXT4_SPEC_s_want_extra_isize (1 << 7) #define EXT4_SPEC_s_max_batch_time (1 << 8) #define EXT4_SPEC_s_min_batch_time (1 << 9) #define EXT4_SPEC_s_inode_readahead_blks (1 << 10) #define EXT4_SPEC_s_li_wait_mult (1 << 11) #define EXT4_SPEC_s_max_dir_size_kb (1 << 12) #define EXT4_SPEC_s_stripe (1 << 13) #define EXT4_SPEC_s_resuid (1 << 14) #define EXT4_SPEC_s_resgid (1 << 15) #define EXT4_SPEC_s_commit_interval (1 << 16) #define EXT4_SPEC_s_fc_debug_max_replay (1 << 17) #define EXT4_SPEC_s_sb_block (1 << 18) #define EXT4_SPEC_mb_optimize_scan (1 << 19) struct ext4_fs_context { char *s_qf_names[EXT4_MAXQUOTAS]; struct fscrypt_dummy_policy dummy_enc_policy; int s_jquota_fmt; /* Format of quota to use */ #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif unsigned short qname_spec; unsigned long vals_s_flags; /* Bits to set in s_flags */ unsigned long mask_s_flags; /* Bits changed in s_flags */ unsigned long journal_devnum; unsigned long s_commit_interval; unsigned long s_stripe; unsigned int s_inode_readahead_blks; unsigned int s_want_extra_isize; unsigned int s_li_wait_mult; unsigned int s_max_dir_size_kb; unsigned int journal_ioprio; unsigned int vals_s_mount_opt; unsigned int mask_s_mount_opt; unsigned int vals_s_mount_opt2; unsigned int mask_s_mount_opt2; unsigned int opt_flags; /* MOPT flags */ unsigned int spec; u32 s_max_batch_time; u32 s_min_batch_time; kuid_t s_resuid; kgid_t s_resgid; ext4_fsblk_t s_sb_block; }; static void ext4_fc_free(struct fs_context *fc) { struct ext4_fs_context *ctx = fc->fs_private; int i; if (!ctx) return; for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(ctx->s_qf_names[i]); fscrypt_free_dummy_policy(&ctx->dummy_enc_policy); kfree(ctx); } int ext4_init_fs_context(struct fs_context *fc) { struct ext4_fs_context *ctx; ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; fc->fs_private = ctx; fc->ops = &ext4_context_ops; return 0; } #ifdef CONFIG_QUOTA /* * Note the name of the specified quota file. */ static int note_qf_name(struct fs_context *fc, int qtype, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; char *qname; if (param->size < 1) { ext4_msg(NULL, KERN_ERR, "Missing quota name"); return -EINVAL; } if (strchr(param->string, '/')) { ext4_msg(NULL, KERN_ERR, "quotafile must be on filesystem root"); return -EINVAL; } if (ctx->s_qf_names[qtype]) { if (strcmp(ctx->s_qf_names[qtype], param->string) != 0) { ext4_msg(NULL, KERN_ERR, "%s quota file already specified", QTYPE2NAME(qtype)); return -EINVAL; } return 0; } qname = kmemdup_nul(param->string, param->size, GFP_KERNEL); if (!qname) { ext4_msg(NULL, KERN_ERR, "Not enough memory for storing quotafile name"); return -ENOMEM; } ctx->s_qf_names[qtype] = qname; ctx->qname_spec |= 1 << qtype; ctx->spec |= EXT4_SPEC_JQUOTA; return 0; } /* * Clear the name of the specified quota file. */ static int unnote_qf_name(struct fs_context *fc, int qtype) { struct ext4_fs_context *ctx = fc->fs_private; kfree(ctx->s_qf_names[qtype]); ctx->s_qf_names[qtype] = NULL; ctx->qname_spec |= 1 << qtype; ctx->spec |= EXT4_SPEC_JQUOTA; return 0; } #endif static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param, struct ext4_fs_context *ctx) { int err; if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) { ext4_msg(NULL, KERN_WARNING, "test_dummy_encryption option not supported"); return -EINVAL; } err = fscrypt_parse_test_dummy_encryption(param, &ctx->dummy_enc_policy); if (err == -EINVAL) { ext4_msg(NULL, KERN_WARNING, "Value of option \"%s\" is unrecognized", param->key); } else if (err == -EEXIST) { ext4_msg(NULL, KERN_WARNING, "Conflicting test_dummy_encryption options"); return -EINVAL; } return err; } #define EXT4_SET_CTX(name) \ static inline __maybe_unused \ void ctx_set_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name |= flag; \ } #define EXT4_CLEAR_CTX(name) \ static inline __maybe_unused \ void ctx_clear_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name &= ~flag; \ } #define EXT4_TEST_CTX(name) \ static inline unsigned long \ ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ return (ctx->vals_s_##name & flag); \ } EXT4_SET_CTX(flags); /* set only */ EXT4_SET_CTX(mount_opt); EXT4_CLEAR_CTX(mount_opt); EXT4_TEST_CTX(mount_opt); EXT4_SET_CTX(mount_opt2); EXT4_CLEAR_CTX(mount_opt2); EXT4_TEST_CTX(mount_opt2); static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; struct fs_parse_result result; const struct mount_opts *m; int is_remount; int token; token = fs_parse(fc, ext4_param_specs, param, &result); if (token < 0) return token; is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; for (m = ext4_mount_opts; m->token != Opt_err; m++) if (token == m->token) break; ctx->opt_flags |= m->flags; if (m->flags & MOPT_EXPLICIT) { if (m->mount_opt & EXT4_MOUNT_DELALLOC) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_DELALLOC); } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM); } else return -EINVAL; } if (m->flags & MOPT_NOSUPPORT) { ext4_msg(NULL, KERN_ERR, "%s option not supported", param->key); return 0; } switch (token) { #ifdef CONFIG_QUOTA case Opt_usrjquota: if (!*param->string) return unnote_qf_name(fc, USRQUOTA); else return note_qf_name(fc, USRQUOTA, param); case Opt_grpjquota: if (!*param->string) return unnote_qf_name(fc, GRPQUOTA); else return note_qf_name(fc, GRPQUOTA, param); #endif case Opt_sb: if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { ext4_msg(NULL, KERN_WARNING, "Ignoring %s option on remount", param->key); } else { ctx->s_sb_block = result.uint_32; ctx->spec |= EXT4_SPEC_s_sb_block; } return 0; case Opt_removed: ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option", param->key); return 0; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT ctx_set_flags(ctx, SB_INLINECRYPT); #else ext4_msg(NULL, KERN_ERR, "inline encryption not supported"); #endif return 0; case Opt_errors: ctx_clear_mount_opt(ctx, EXT4_MOUNT_ERRORS_MASK); ctx_set_mount_opt(ctx, result.uint_32); return 0; #ifdef CONFIG_QUOTA case Opt_jqfmt: ctx->s_jquota_fmt = result.uint_32; ctx->spec |= EXT4_SPEC_JQFMT; return 0; #endif case Opt_data: ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS); ctx_set_mount_opt(ctx, result.uint_32); ctx->spec |= EXT4_SPEC_DATAJ; return 0; case Opt_commit: if (result.uint_32 == 0) result.uint_32 = JBD2_DEFAULT_MAX_COMMIT_AGE; else if (result.uint_32 > INT_MAX / HZ) { ext4_msg(NULL, KERN_ERR, "Invalid commit interval %d, " "must be smaller than %d", result.uint_32, INT_MAX / HZ); return -EINVAL; } ctx->s_commit_interval = HZ * result.uint_32; ctx->spec |= EXT4_SPEC_s_commit_interval; return 0; case Opt_debug_want_extra_isize: if ((result.uint_32 & 1) || (result.uint_32 < 4)) { ext4_msg(NULL, KERN_ERR, "Invalid want_extra_isize %d", result.uint_32); return -EINVAL; } ctx->s_want_extra_isize = result.uint_32; ctx->spec |= EXT4_SPEC_s_want_extra_isize; return 0; case Opt_max_batch_time: ctx->s_max_batch_time = result.uint_32; ctx->spec |= EXT4_SPEC_s_max_batch_time; return 0; case Opt_min_batch_time: ctx->s_min_batch_time = result.uint_32; ctx->spec |= EXT4_SPEC_s_min_batch_time; return 0; case Opt_inode_readahead_blks: if (result.uint_32 && (result.uint_32 > (1 << 30) || !is_power_of_2(result.uint_32))) { ext4_msg(NULL, KERN_ERR, "EXT4-fs: inode_readahead_blks must be " "0 or a power of 2 smaller than 2^31"); return -EINVAL; } ctx->s_inode_readahead_blks = result.uint_32; ctx->spec |= EXT4_SPEC_s_inode_readahead_blks; return 0; case Opt_init_itable: ctx_set_mount_opt(ctx, EXT4_MOUNT_INIT_INODE_TABLE); ctx->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; if (param->type == fs_value_is_string) ctx->s_li_wait_mult = result.uint_32; ctx->spec |= EXT4_SPEC_s_li_wait_mult; return 0; case Opt_max_dir_size_kb: ctx->s_max_dir_size_kb = result.uint_32; ctx->spec |= EXT4_SPEC_s_max_dir_size_kb; return 0; #ifdef CONFIG_EXT4_DEBUG case Opt_fc_debug_max_replay: ctx->s_fc_debug_max_replay = result.uint_32; ctx->spec |= EXT4_SPEC_s_fc_debug_max_replay; return 0; #endif case Opt_stripe: ctx->s_stripe = result.uint_32; ctx->spec |= EXT4_SPEC_s_stripe; return 0; case Opt_resuid: ctx->s_resuid = result.uid; ctx->spec |= EXT4_SPEC_s_resuid; return 0; case Opt_resgid: ctx->s_resgid = result.gid; ctx->spec |= EXT4_SPEC_s_resgid; return 0; case Opt_journal_dev: if (is_remount) { ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); return -EINVAL; } ctx->journal_devnum = result.uint_32; ctx->spec |= EXT4_SPEC_JOURNAL_DEV; return 0; case Opt_journal_path: { struct inode *journal_inode; struct path path; int error; if (is_remount) { ext4_msg(NULL, KERN_ERR, "Cannot specify journal on remount"); return -EINVAL; } error = fs_lookup_param(fc, param, 1, LOOKUP_FOLLOW, &path); if (error) { ext4_msg(NULL, KERN_ERR, "error: could not find " "journal device path"); return -EINVAL; } journal_inode = d_inode(path.dentry); ctx->journal_devnum = new_encode_dev(journal_inode->i_rdev); ctx->spec |= EXT4_SPEC_JOURNAL_DEV; path_put(&path); return 0; } case Opt_journal_ioprio: if (result.uint_32 > 7) { ext4_msg(NULL, KERN_ERR, "Invalid journal IO priority" " (must be 0-7)"); return -EINVAL; } ctx->journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, result.uint_32); ctx->spec |= EXT4_SPEC_JOURNAL_IOPRIO; return 0; case Opt_test_dummy_encryption: return ext4_parse_test_dummy_encryption(param, ctx); case Opt_dax: case Opt_dax_type: #ifdef CONFIG_FS_DAX { int type = (token == Opt_dax) ? Opt_dax : result.uint_32; switch (type) { case Opt_dax: case Opt_dax_always: ctx_set_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); break; case Opt_dax_never: ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); break; case Opt_dax_inode: ctx_clear_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS); ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER); /* Strictly for printing options */ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE); break; } return 0; } #else ext4_msg(NULL, KERN_INFO, "dax option not supported"); return -EINVAL; #endif case Opt_data_err: if (result.uint_32 == Opt_data_err_abort) ctx_set_mount_opt(ctx, m->mount_opt); else if (result.uint_32 == Opt_data_err_ignore) ctx_clear_mount_opt(ctx, m->mount_opt); return 0; case Opt_mb_optimize_scan: if (result.int_32 == 1) { ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN); ctx->spec |= EXT4_SPEC_mb_optimize_scan; } else if (result.int_32 == 0) { ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN); ctx->spec |= EXT4_SPEC_mb_optimize_scan; } else { ext4_msg(NULL, KERN_WARNING, "mb_optimize_scan should be set to 0 or 1."); return -EINVAL; } return 0; } /* * At this point we should only be getting options requiring MOPT_SET, * or MOPT_CLEAR. Anything else is a bug */ if (m->token == Opt_err) { ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s", param->key); WARN_ON(1); return -EINVAL; } else { unsigned int set = 0; if ((param->type == fs_value_is_flag) || result.uint_32 > 0) set = 1; if (m->flags & MOPT_CLEAR) set = !set; else if (unlikely(!(m->flags & MOPT_SET))) { ext4_msg(NULL, KERN_WARNING, "buggy handling of option %s", param->key); WARN_ON(1); return -EINVAL; } if (m->flags & MOPT_2) { if (set != 0) ctx_set_mount_opt2(ctx, m->mount_opt); else ctx_clear_mount_opt2(ctx, m->mount_opt); } else { if (set != 0) ctx_set_mount_opt(ctx, m->mount_opt); else ctx_clear_mount_opt(ctx, m->mount_opt); } } return 0; } static int parse_options(struct fs_context *fc, char *options) { struct fs_parameter param; int ret; char *key; if (!options) return 0; while ((key = strsep(&options, ",")) != NULL) { if (*key) { size_t v_len = 0; char *value = strchr(key, '='); param.type = fs_value_is_flag; param.string = NULL; if (value) { if (value == key) continue; *value++ = 0; v_len = strlen(value); param.string = kmemdup_nul(value, v_len, GFP_KERNEL); if (!param.string) return -ENOMEM; param.type = fs_value_is_string; } param.key = key; param.size = v_len; ret = ext4_parse_param(fc, ¶m); kfree(param.string); if (ret < 0) return ret; } } ret = ext4_validate_options(fc); if (ret < 0) return ret; return 0; } static int parse_apply_sb_mount_options(struct super_block *sb, struct ext4_fs_context *m_ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); char *s_mount_opts = NULL; struct ext4_fs_context *s_ctx = NULL; struct fs_context *fc = NULL; int ret = -ENOMEM; if (!sbi->s_es->s_mount_opts[0]) return 0; s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, sizeof(sbi->s_es->s_mount_opts), GFP_KERNEL); if (!s_mount_opts) return ret; fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); if (!fc) goto out_free; s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); if (!s_ctx) goto out_free; fc->fs_private = s_ctx; fc->s_fs_info = sbi; ret = parse_options(fc, s_mount_opts); if (ret < 0) goto parse_failed; ret = ext4_check_opt_consistency(fc, sb); if (ret < 0) { parse_failed: ext4_msg(sb, KERN_WARNING, "failed to parse options in superblock: %s", s_mount_opts); ret = 0; goto out_free; } if (s_ctx->spec & EXT4_SPEC_JOURNAL_DEV) m_ctx->journal_devnum = s_ctx->journal_devnum; if (s_ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO) m_ctx->journal_ioprio = s_ctx->journal_ioprio; ext4_apply_options(fc, sb); ret = 0; out_free: if (fc) { ext4_fc_free(fc); kfree(fc); } kfree(s_mount_opts); return ret; } static void ext4_apply_quota_options(struct fs_context *fc, struct super_block *sb) { #ifdef CONFIG_QUOTA bool quota_feature = ext4_has_feature_quota(sb); struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = EXT4_SB(sb); char *qname; int i; if (quota_feature) return; if (ctx->spec & EXT4_SPEC_JQUOTA) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (!(ctx->qname_spec & (1 << i))) continue; qname = ctx->s_qf_names[i]; /* May be NULL */ if (qname) set_opt(sb, QUOTA); ctx->s_qf_names[i] = NULL; qname = rcu_replace_pointer(sbi->s_qf_names[i], qname, lockdep_is_held(&sb->s_umount)); if (qname) kfree_rcu_mightsleep(qname); } } if (ctx->spec & EXT4_SPEC_JQFMT) sbi->s_jquota_fmt = ctx->s_jquota_fmt; #endif } /* * Check quota settings consistency. */ static int ext4_check_quota_consistency(struct fs_context *fc, struct super_block *sb) { #ifdef CONFIG_QUOTA struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = EXT4_SB(sb); bool quota_feature = ext4_has_feature_quota(sb); bool quota_loaded = sb_any_quota_loaded(sb); bool usr_qf_name, grp_qf_name, usrquota, grpquota; int quota_flags, i; /* * We do the test below only for project quotas. 'usrquota' and * 'grpquota' mount options are allowed even without quota feature * to support legacy quotas in quota files. */ if (ctx_test_mount_opt(ctx, EXT4_MOUNT_PRJQUOTA) && !ext4_has_feature_project(sb)) { ext4_msg(NULL, KERN_ERR, "Project quota feature not enabled. " "Cannot enable project quota enforcement."); return -EINVAL; } quota_flags = EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA | EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA; if (quota_loaded && ctx->mask_s_mount_opt & quota_flags && !ctx_test_mount_opt(ctx, quota_flags)) goto err_quota_change; if (ctx->spec & EXT4_SPEC_JQUOTA) { for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (!(ctx->qname_spec & (1 << i))) continue; if (quota_loaded && !!sbi->s_qf_names[i] != !!ctx->s_qf_names[i]) goto err_jquota_change; if (sbi->s_qf_names[i] && ctx->s_qf_names[i] && strcmp(get_qf_name(sb, sbi, i), ctx->s_qf_names[i]) != 0) goto err_jquota_specified; } if (quota_feature) { ext4_msg(NULL, KERN_INFO, "Journaled quota options ignored when " "QUOTA feature is enabled"); return 0; } } if (ctx->spec & EXT4_SPEC_JQFMT) { if (sbi->s_jquota_fmt != ctx->s_jquota_fmt && quota_loaded) goto err_jquota_change; if (quota_feature) { ext4_msg(NULL, KERN_INFO, "Quota format mount options " "ignored when QUOTA feature is enabled"); return 0; } } /* Make sure we don't mix old and new quota format */ usr_qf_name = (get_qf_name(sb, sbi, USRQUOTA) || ctx->s_qf_names[USRQUOTA]); grp_qf_name = (get_qf_name(sb, sbi, GRPQUOTA) || ctx->s_qf_names[GRPQUOTA]); usrquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || test_opt(sb, USRQUOTA)); grpquota = (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) || test_opt(sb, GRPQUOTA)); if (usr_qf_name) { ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); usrquota = false; } if (grp_qf_name) { ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); grpquota = false; } if (usr_qf_name || grp_qf_name) { if (usrquota || grpquota) { ext4_msg(NULL, KERN_ERR, "old and new quota " "format mixing"); return -EINVAL; } if (!(ctx->spec & EXT4_SPEC_JQFMT || sbi->s_jquota_fmt)) { ext4_msg(NULL, KERN_ERR, "journaled quota format " "not specified"); return -EINVAL; } } return 0; err_quota_change: ext4_msg(NULL, KERN_ERR, "Cannot change quota options when quota turned on"); return -EINVAL; err_jquota_change: ext4_msg(NULL, KERN_ERR, "Cannot change journaled quota " "options when quota turned on"); return -EINVAL; err_jquota_specified: ext4_msg(NULL, KERN_ERR, "%s quota file already specified", QTYPE2NAME(i)); return -EINVAL; #else return 0; #endif } static int ext4_check_test_dummy_encryption(const struct fs_context *fc, struct super_block *sb) { const struct ext4_fs_context *ctx = fc->fs_private; const struct ext4_sb_info *sbi = EXT4_SB(sb); if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy)) return 0; if (!ext4_has_feature_encrypt(sb)) { ext4_msg(NULL, KERN_WARNING, "test_dummy_encryption requires encrypt feature"); return -EINVAL; } /* * This mount option is just for testing, and it's not worthwhile to * implement the extra complexity (e.g. RCU protection) that would be * needed to allow it to be set or changed during remount. We do allow * it to be specified during remount, but only if there is no change. */ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, &ctx->dummy_enc_policy)) return 0; ext4_msg(NULL, KERN_WARNING, "Can't set or change test_dummy_encryption on remount"); return -EINVAL; } /* Also make sure s_mount_opts didn't contain a conflicting value. */ if (fscrypt_is_dummy_policy_set(&sbi->s_dummy_enc_policy)) { if (fscrypt_dummy_policies_equal(&sbi->s_dummy_enc_policy, &ctx->dummy_enc_policy)) return 0; ext4_msg(NULL, KERN_WARNING, "Conflicting test_dummy_encryption options"); return -EINVAL; } return 0; } static void ext4_apply_test_dummy_encryption(struct ext4_fs_context *ctx, struct super_block *sb) { if (!fscrypt_is_dummy_policy_set(&ctx->dummy_enc_policy) || /* if already set, it was already verified to be the same */ fscrypt_is_dummy_policy_set(&EXT4_SB(sb)->s_dummy_enc_policy)) return; EXT4_SB(sb)->s_dummy_enc_policy = ctx->dummy_enc_policy; memset(&ctx->dummy_enc_policy, 0, sizeof(ctx->dummy_enc_policy)); ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); } static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; int err; if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { ext4_msg(NULL, KERN_ERR, "Mount option(s) incompatible with ext2"); return -EINVAL; } if ((ctx->opt_flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) { ext4_msg(NULL, KERN_ERR, "Mount option(s) incompatible with ext3"); return -EINVAL; } if (ctx->s_want_extra_isize > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE)) { ext4_msg(NULL, KERN_ERR, "Invalid want_extra_isize %d", ctx->s_want_extra_isize); return -EINVAL; } err = ext4_check_test_dummy_encryption(fc, sb); if (err) return err; if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) { if (!sbi->s_journal) { ext4_msg(NULL, KERN_WARNING, "Remounting file system with no journal " "so ignoring journalled data option"); ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS); } else if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_FLAGS) != test_opt(sb, DATA_FLAGS)) { ext4_msg(NULL, KERN_ERR, "Cannot change data mode " "on remount"); return -EINVAL; } } if (is_remount) { if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(NULL, KERN_ERR, "can't mount with " "both data=journal and dax"); return -EINVAL; } if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) { fail_dax_change_remount: ext4_msg(NULL, KERN_ERR, "can't change " "dax mount option while remounting"); return -EINVAL; } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_NEVER) && (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS))) { goto fail_dax_change_remount; } else if (ctx_test_mount_opt2(ctx, EXT4_MOUNT2_DAX_INODE) && ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) || (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) || !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE))) { goto fail_dax_change_remount; } } return ext4_check_quota_consistency(fc, sb); } static void ext4_apply_options(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; sbi->s_mount_opt &= ~ctx->mask_s_mount_opt; sbi->s_mount_opt |= ctx->vals_s_mount_opt; sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2; sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2; sb->s_flags &= ~ctx->mask_s_flags; sb->s_flags |= ctx->vals_s_flags; #define APPLY(X) ({ if (ctx->spec & EXT4_SPEC_##X) sbi->X = ctx->X; }) APPLY(s_commit_interval); APPLY(s_stripe); APPLY(s_max_batch_time); APPLY(s_min_batch_time); APPLY(s_want_extra_isize); APPLY(s_inode_readahead_blks); APPLY(s_max_dir_size_kb); APPLY(s_li_wait_mult); APPLY(s_resgid); APPLY(s_resuid); #ifdef CONFIG_EXT4_DEBUG APPLY(s_fc_debug_max_replay); #endif ext4_apply_quota_options(fc, sb); ext4_apply_test_dummy_encryption(ctx, sb); } static int ext4_validate_options(struct fs_context *fc) { #ifdef CONFIG_QUOTA struct ext4_fs_context *ctx = fc->fs_private; char *usr_qf_name, *grp_qf_name; usr_qf_name = ctx->s_qf_names[USRQUOTA]; grp_qf_name = ctx->s_qf_names[GRPQUOTA]; if (usr_qf_name || grp_qf_name) { if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) && usr_qf_name) ctx_clear_mount_opt(ctx, EXT4_MOUNT_USRQUOTA); if (ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA) && grp_qf_name) ctx_clear_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA); if (ctx_test_mount_opt(ctx, EXT4_MOUNT_USRQUOTA) || ctx_test_mount_opt(ctx, EXT4_MOUNT_GRPQUOTA)) { ext4_msg(NULL, KERN_ERR, "old and new quota " "format mixing"); return -EINVAL; } } #endif return 1; } static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) { #if defined(CONFIG_QUOTA) struct ext4_sb_info *sbi = EXT4_SB(sb); char *usr_qf_name, *grp_qf_name; if (sbi->s_jquota_fmt) { char *fmtname = ""; switch (sbi->s_jquota_fmt) { case QFMT_VFS_OLD: fmtname = "vfsold"; break; case QFMT_VFS_V0: fmtname = "vfsv0"; break; case QFMT_VFS_V1: fmtname = "vfsv1"; break; } seq_printf(seq, ",jqfmt=%s", fmtname); } rcu_read_lock(); usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]); grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]); if (usr_qf_name) seq_show_option(seq, "usrjquota", usr_qf_name); if (grp_qf_name) seq_show_option(seq, "grpjquota", grp_qf_name); rcu_read_unlock(); #endif } static const char *token2str(int token) { const struct fs_parameter_spec *spec; for (spec = ext4_param_specs; spec->name != NULL; spec++) if (spec->opt == token && !spec->type) break; return spec->name; } /* * Show an option if * - it's set to a non-default value OR * - if the per-sb default is different from the global default */ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, int nodefs) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int def_errors; const struct mount_opts *m; char sep = nodefs ? '\n' : ','; #define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep) #define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg) if (sbi->s_sb_block != 1) SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block); for (m = ext4_mount_opts; m->token != Opt_err; m++) { int want_set = m->flags & MOPT_SET; int opt_2 = m->flags & MOPT_2; unsigned int mount_opt, def_mount_opt; if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || m->flags & MOPT_SKIP) continue; if (opt_2) { mount_opt = sbi->s_mount_opt2; def_mount_opt = sbi->s_def_mount_opt2; } else { mount_opt = sbi->s_mount_opt; def_mount_opt = sbi->s_def_mount_opt; } /* skip if same as the default */ if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt))) continue; /* select Opt_noFoo vs Opt_Foo */ if ((want_set && (mount_opt & m->mount_opt) != m->mount_opt) || (!want_set && (mount_opt & m->mount_opt))) continue; SEQ_OPTS_PRINT("%s", token2str(m->token)); } if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) SEQ_OPTS_PRINT("resuid=%u", from_kuid_munged(&init_user_ns, sbi->s_resuid)); if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) SEQ_OPTS_PRINT("resgid=%u", from_kgid_munged(&init_user_ns, sbi->s_resgid)); def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors); if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO) SEQ_OPTS_PUTS("errors=remount-ro"); if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE) SEQ_OPTS_PUTS("errors=continue"); if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC) SEQ_OPTS_PUTS("errors=panic"); if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ); if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time); if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time); if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); if (nodefs || EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) SEQ_OPTS_PUTS("data=journal"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) SEQ_OPTS_PUTS("data=ordered"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) SEQ_OPTS_PUTS("data=writeback"); } if (nodefs || sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) SEQ_OPTS_PRINT("inode_readahead_blks=%u", sbi->s_inode_readahead_blks); if (test_opt(sb, INIT_INODE_TABLE) && (nodefs || (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT))) SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); if (nodefs || sbi->s_max_dir_size_kb) SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); if (test_opt(sb, DATA_ERR_ABORT)) SEQ_OPTS_PUTS("data_err=abort"); fscrypt_show_test_dummy_encryption(seq, sep, sb); if (sb->s_flags & SB_INLINECRYPT) SEQ_OPTS_PUTS("inlinecrypt"); if (test_opt(sb, DAX_ALWAYS)) { if (IS_EXT2_SB(sb)) SEQ_OPTS_PUTS("dax"); else SEQ_OPTS_PUTS("dax=always"); } else if (test_opt2(sb, DAX_NEVER)) { SEQ_OPTS_PUTS("dax=never"); } else if (test_opt2(sb, DAX_INODE)) { SEQ_OPTS_PUTS("dax=inode"); } if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD && !test_opt2(sb, MB_OPTIMIZE_SCAN)) { SEQ_OPTS_PUTS("mb_optimize_scan=0"); } else if (sbi->s_groups_count < MB_DEFAULT_LINEAR_SCAN_THRESHOLD && test_opt2(sb, MB_OPTIMIZE_SCAN)) { SEQ_OPTS_PUTS("mb_optimize_scan=1"); } if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) SEQ_OPTS_PUTS("prefetch_block_bitmaps"); ext4_show_quota_options(seq, sb); return 0; } static int ext4_show_options(struct seq_file *seq, struct dentry *root) { return _ext4_show_options(seq, root->d_sb, 0); } int ext4_seq_options_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; int rc; seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw"); rc = _ext4_show_options(seq, sb, 1); seq_putc(seq, '\n'); return rc; } static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err = 0; if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); err = -EROFS; goto done; } if (read_only) goto done; if (!(sbi->s_mount_state & EXT4_VALID_FS)) ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " "running e2fsck is recommended"); else if (sbi->s_mount_state & EXT4_ERROR_FS) ext4_msg(sb, KERN_WARNING, "warning: mounting fs with errors, " "running e2fsck is recommended"); else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) ext4_msg(sb, KERN_WARNING, "warning: maximal mount count reached, " "running e2fsck is recommended"); else if (le32_to_cpu(es->s_checkinterval) && (ext4_get_tstamp(es, s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds())) ext4_msg(sb, KERN_WARNING, "warning: checktime reached, " "running e2fsck is recommended"); if (!sbi->s_journal) es->s_state &= cpu_to_le16(~EXT4_VALID_FS); if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext4_update_tstamp(es, s_mtime); if (sbi->s_journal) { ext4_set_feature_journal_needs_recovery(sb); if (ext4_has_feature_orphan_file(sb)) ext4_set_feature_orphan_present(sb); } err = ext4_commit_super(sb); done: if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n", sb->s_blocksize, sbi->s_groups_count, EXT4_BLOCKS_PER_GROUP(sb), EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt, sbi->s_mount_opt2); return err; } int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct flex_groups **old_groups, **new_groups; int size, i, j; if (!sbi->s_log_groups_per_flex) return 0; size = ext4_flex_group(sbi, ngroup - 1) + 1; if (size <= sbi->s_flex_groups_allocated) return 0; new_groups = kvzalloc(roundup_pow_of_two(size * sizeof(*sbi->s_flex_groups)), GFP_KERNEL); if (!new_groups) { ext4_msg(sb, KERN_ERR, "not enough memory for %d flex group pointers", size); return -ENOMEM; } for (i = sbi->s_flex_groups_allocated; i < size; i++) { new_groups[i] = kvzalloc(roundup_pow_of_two( sizeof(struct flex_groups)), GFP_KERNEL); if (!new_groups[i]) { for (j = sbi->s_flex_groups_allocated; j < i; j++) kvfree(new_groups[j]); kvfree(new_groups); ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", size); return -ENOMEM; } } rcu_read_lock(); old_groups = rcu_dereference(sbi->s_flex_groups); if (old_groups) memcpy(new_groups, old_groups, (sbi->s_flex_groups_allocated * sizeof(struct flex_groups *))); rcu_read_unlock(); rcu_assign_pointer(sbi->s_flex_groups, new_groups); sbi->s_flex_groups_allocated = size; if (old_groups) ext4_kvfree_array_rcu(old_groups); return 0; } static int ext4_fill_flex_info(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; struct flex_groups *fg; ext4_group_t flex_group; int i, err; sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { sbi->s_log_groups_per_flex = 0; return 1; } err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); if (err) goto failed; for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext4_get_group_desc(sb, i, NULL); flex_group = ext4_flex_group(sbi, i); fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes); atomic64_add(ext4_free_group_clusters(sb, gdp), &fg->free_clusters); atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs); } return 1; failed: return 0; } static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { int offset = offsetof(struct ext4_group_desc, bg_checksum); __u16 crc = 0; __le32 le_group = cpu_to_le32(block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ __u32 csum32; __u16 dummy_csum = 0; csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); if (offset < sbi->s_desc_size) csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset, sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; goto out; } /* old crc16 code */ if (!ext4_has_feature_gdt_csum(sb)) return 0; crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); crc = crc16(crc, (__u8 *)gdp, offset); offset += sizeof(gdp->bg_checksum); /* skip checksum */ /* for checksum of struct ext4_group_desc do the rest...*/ if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size) crc = crc16(crc, (__u8 *)gdp + offset, sbi->s_desc_size - offset); out: return cpu_to_le16(crc); } int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { if (ext4_has_group_desc_csum(sb) && (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp))) return 0; return 1; } void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { if (!ext4_has_group_desc_csum(sb)) return; gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp); } /* Called at mount-time, super-block is locked */ static int ext4_check_descriptors(struct super_block *sb, ext4_fsblk_t sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); ext4_fsblk_t last_block; ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0); ext4_fsblk_t block_bitmap; ext4_fsblk_t inode_bitmap; ext4_fsblk_t inode_table; int flexbg_flag = 0; ext4_group_t i, grp = sbi->s_groups_count; if (ext4_has_feature_flex_bg(sb)) flexbg_flag = 1; ext4_debug("Checking group descriptors"); for (i = 0; i < sbi->s_groups_count; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (i == sbi->s_groups_count - 1 || flexbg_flag) last_block = ext4_blocks_count(sbi->s_es) - 1; else last_block = first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); if ((grp == sbi->s_groups_count) && !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) grp = i; block_bitmap = ext4_block_bitmap(sb, gdp); if (block_bitmap == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (block_bitmap >= sb_block + 1 && block_bitmap <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (block_bitmap < first_block || block_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u not in group " "(block %llu)!", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (inode_bitmap >= sb_block + 1 && inode_bitmap <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (inode_bitmap < first_block || inode_bitmap > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u not in group " "(block %llu)!", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); if (inode_table == sb_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "superblock", i); if (!sb_rdonly(sb)) return 0; } if (inode_table >= sb_block + 1 && inode_table <= last_bg_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u overlaps " "block group descriptors", i); if (!sb_rdonly(sb)) return 0; } if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u not in group " "(block %llu)!", i, inode_table); return 0; } ext4_lock_group(sb, i); if (!ext4_group_desc_csum_verify(sb, i, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Checksum for group %u failed (%u!=%u)", i, le16_to_cpu(ext4_group_desc_csum(sb, i, gdp)), le16_to_cpu(gdp->bg_checksum)); if (!sb_rdonly(sb)) { ext4_unlock_group(sb, i); return 0; } } ext4_unlock_group(sb, i); if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } if (NULL != first_not_zeroed) *first_not_zeroed = grp; return 1; } /* * Maximal extent format file size. * Resulting logical blkno at s_maxbytes must fit in our on-disk * extent format containers, within a sector_t, and within i_blocks * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, * so that won't be a limiting factor. * * However there is other limiting factor. We do store extents in the form * of starting block and length, hence the resulting length of the extent * covering maximum file size must fit into on-disk format containers as * well. Given that length is always by 1 unit bigger than max unit (because * we count 0 as well) we have to lower the s_maxbytes by one fs block. * * Note, this does *not* consider any metadata overhead for vfs i_blocks. */ static loff_t ext4_max_size(int blkbits, int has_huge_files) { loff_t res; loff_t upper_limit = MAX_LFS_FILESIZE; BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64)); if (!has_huge_files) { upper_limit = (1LL << 32) - 1; /* total blocks in file system block size */ upper_limit >>= (blkbits - 9); upper_limit <<= blkbits; } /* * 32-bit extent-start container, ee_block. We lower the maxbytes * by one fs block, so ee_len can cover the extent of maximum file * size */ res = (1LL << 32) - 1; res <<= blkbits; /* Sanity check against vm- & vfs- imposed limits */ if (res > upper_limit) res = upper_limit; return res; } /* * Maximal bitmap file size. There is a direct, and {,double-,triple-}indirect * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. * We need to be 1 filesystem block less than the 2^48 sector limit. */ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { loff_t upper_limit, res = EXT4_NDIR_BLOCKS; int meta_blocks; unsigned int ppb = 1 << (bits - 2); /* * This is calculated to be the largest file size for a dense, block * mapped file such that the file's total number of 512-byte sectors, * including data and all indirect blocks, does not exceed (2^48 - 1). * * __u32 i_blocks_lo and _u16 i_blocks_high represent the total * number of 512-byte sectors of the file. */ if (!has_huge_files) { /* * !has_huge_files or implies that the inode i_block field * represents total file blocks in 2^32 512-byte sectors == * size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; /* total blocks in file system block size */ upper_limit >>= (bits - 9); } else { /* * We use 48 bit ext4_inode i_blocks * With EXT4_HUGE_FILE_FL set the i_blocks * represent total number of blocks in * file system block size */ upper_limit = (1LL << 48) - 1; } /* Compute how many blocks we can address by block tree */ res += ppb; res += ppb * ppb; res += ((loff_t)ppb) * ppb * ppb; /* Compute how many metadata blocks are needed */ meta_blocks = 1; meta_blocks += 1 + ppb; meta_blocks += 1 + ppb + ppb * ppb; /* Does block tree limit file size? */ if (res + meta_blocks <= upper_limit) goto check_lfs; res = upper_limit; /* How many metadata blocks are needed for addressing upper_limit? */ upper_limit -= EXT4_NDIR_BLOCKS; /* indirect blocks */ meta_blocks = 1; upper_limit -= ppb; /* double indirect blocks */ if (upper_limit < ppb * ppb) { meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb); res -= meta_blocks; goto check_lfs; } meta_blocks += 1 + ppb; upper_limit -= ppb * ppb; /* tripple indirect blocks for the rest */ meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) + DIV_ROUND_UP_ULL(upper_limit, ppb*ppb); res -= meta_blocks; check_lfs: res <<= bits; if (res > MAX_LFS_FILESIZE) res = MAX_LFS_FILESIZE; return res; } static ext4_fsblk_t descriptor_loc(struct super_block *sb, ext4_fsblk_t logical_sb_block, int nr) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t bg, first_meta_bg; int has_super = 0; first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg) return logical_sb_block + nr + 1; bg = sbi->s_desc_per_block * nr; if (ext4_bg_has_super(sb, bg)) has_super = 1; /* * If we have a meta_bg fs with 1k blocks, group 0's GDT is at * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled * on modern mke2fs or blksize > 1k on older mke2fs) then we must * compensate. */ if (sb->s_blocksize == 1024 && nr == 0 && le32_to_cpu(sbi->s_es->s_first_data_block) == 0) has_super++; return (has_super + ext4_group_first_block_no(sb, bg)); } /** * ext4_get_stripe_size: Get the stripe size. * @sbi: In memory super block info * * If we have specified it via mount option, then * use the mount option value. If the value specified at mount time is * greater than the blocks per group use the super block value. * If the super block value is greater than blocks per group return 0. * Allocator needs it be less than blocks per group. * */ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) { unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); unsigned long stripe_width = le32_to_cpu(sbi->s_es->s_raid_stripe_width); int ret; if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) ret = sbi->s_stripe; else if (stripe_width && stripe_width <= sbi->s_blocks_per_group) ret = stripe_width; else if (stride && stride <= sbi->s_blocks_per_group) ret = stride; else ret = 0; /* * If the stripe width is 1, this makes no sense and * we set it to 0 to turn off stripe handling code. */ if (ret <= 1) ret = 0; return ret; } /* * Check whether this filesystem can be mounted based on * the features present and the RDONLY/RDWR mount requested. * Returns 1 if this filesystem can be mounted as requested, * 0 if it cannot be. */ int ext4_feature_set_ok(struct super_block *sb, int readonly) { if (ext4_has_unknown_ext4_incompat_features(sb)) { ext4_msg(sb, KERN_ERR, "Couldn't mount because of " "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & ~EXT4_FEATURE_INCOMPAT_SUPP)); return 0; } if (!IS_ENABLED(CONFIG_UNICODE) && ext4_has_feature_casefold(sb)) { ext4_msg(sb, KERN_ERR, "Filesystem with casefold feature cannot be " "mounted without CONFIG_UNICODE"); return 0; } if (readonly) return 1; if (ext4_has_feature_readonly(sb)) { ext4_msg(sb, KERN_INFO, "filesystem is read-only"); sb->s_flags |= SB_RDONLY; return 1; } /* Check that feature set is OK for a read-write mount */ if (ext4_has_unknown_ext4_ro_compat_features(sb)) { ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & ~EXT4_FEATURE_RO_COMPAT_SUPP)); return 0; } if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) { ext4_msg(sb, KERN_ERR, "Can't support bigalloc feature without " "extents feature\n"); return 0; } #if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2) if (!readonly && (ext4_has_feature_quota(sb) || ext4_has_feature_project(sb))) { ext4_msg(sb, KERN_ERR, "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2"); return 0; } #endif /* CONFIG_QUOTA */ return 1; } /* * This function is called once a day if we have errors logged * on the file system */ static void print_daily_error_info(struct timer_list *t) { struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report); struct super_block *sb = sbi->s_sb; struct ext4_super_block *es = sbi->s_es; if (es->s_error_count) /* fsck newer than v1.41.13 is needed to clean this condition. */ ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u", le32_to_cpu(es->s_error_count)); if (es->s_first_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d", sb->s_id, ext4_get_tstamp(es, s_first_error_time), (int) sizeof(es->s_first_error_func), es->s_first_error_func, le32_to_cpu(es->s_first_error_line)); if (es->s_first_error_ino) printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_first_error_ino)); if (es->s_first_error_block) printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_first_error_block)); printk(KERN_CONT "\n"); } if (es->s_last_error_time) { printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d", sb->s_id, ext4_get_tstamp(es, s_last_error_time), (int) sizeof(es->s_last_error_func), es->s_last_error_func, le32_to_cpu(es->s_last_error_line)); if (es->s_last_error_ino) printk(KERN_CONT ": inode %u", le32_to_cpu(es->s_last_error_ino)); if (es->s_last_error_block) printk(KERN_CONT ": block %llu", (unsigned long long) le64_to_cpu(es->s_last_error_block)); printk(KERN_CONT "\n"); } mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ } /* Find next suitable group and run ext4_init_inode_table */ static int ext4_run_li_request(struct ext4_li_request *elr) { struct ext4_group_desc *gdp = NULL; struct super_block *sb = elr->lr_super; ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; ext4_group_t group = elr->lr_next_group; unsigned int prefetch_ios = 0; int ret = 0; int nr = EXT4_SB(sb)->s_mb_prefetch; u64 start_time; if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios); ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr); trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr); if (group >= elr->lr_next_group) { ret = 1; if (elr->lr_first_not_zeroed != ngroups && !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { elr->lr_next_group = elr->lr_first_not_zeroed; elr->lr_mode = EXT4_LI_MODE_ITABLE; ret = 0; } } return ret; } for (; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { ret = 1; break; } if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) break; } if (group >= ngroups) ret = 1; if (!ret) { start_time = ktime_get_ns(); ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); trace_ext4_lazy_itable_init(sb, group); if (elr->lr_timeout == 0) { elr->lr_timeout = nsecs_to_jiffies((ktime_get_ns() - start_time) * EXT4_SB(elr->lr_super)->s_li_wait_mult); } elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; } return ret; } /* * Remove lr_request from the list_request and free the * request structure. Should be called with li_list_mtx held */ static void ext4_remove_li_request(struct ext4_li_request *elr) { if (!elr) return; list_del(&elr->lr_request); EXT4_SB(elr->lr_super)->s_li_request = NULL; kfree(elr); } static void ext4_unregister_li_request(struct super_block *sb) { mutex_lock(&ext4_li_mtx); if (!ext4_li_info) { mutex_unlock(&ext4_li_mtx); return; } mutex_lock(&ext4_li_info->li_list_mtx); ext4_remove_li_request(EXT4_SB(sb)->s_li_request); mutex_unlock(&ext4_li_info->li_list_mtx); mutex_unlock(&ext4_li_mtx); } static struct task_struct *ext4_lazyinit_task; /* * This is the function where ext4lazyinit thread lives. It walks * through the request list searching for next scheduled filesystem. * When such a fs is found, run the lazy initialization request * (ext4_rn_li_request) and keep track of the time spend in this * function. Based on that time we compute next schedule time of * the request. When walking through the list is complete, compute * next waking time and put itself into sleep. */ static int ext4_lazyinit_thread(void *arg) { struct ext4_lazy_init *eli = arg; struct list_head *pos, *n; struct ext4_li_request *elr; unsigned long next_wakeup, cur; BUG_ON(NULL == eli); set_freezable(); cont_thread: while (true) { bool next_wakeup_initialized = false; next_wakeup = 0; mutex_lock(&eli->li_list_mtx); if (list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); goto exit_thread; } list_for_each_safe(pos, n, &eli->li_request_list) { int err = 0; int progress = 0; elr = list_entry(pos, struct ext4_li_request, lr_request); if (time_before(jiffies, elr->lr_next_sched)) { if (!next_wakeup_initialized || time_before(elr->lr_next_sched, next_wakeup)) { next_wakeup = elr->lr_next_sched; next_wakeup_initialized = true; } continue; } if (down_read_trylock(&elr->lr_super->s_umount)) { if (sb_start_write_trylock(elr->lr_super)) { progress = 1; /* * We hold sb->s_umount, sb can not * be removed from the list, it is * now safe to drop li_list_mtx */ mutex_unlock(&eli->li_list_mtx); err = ext4_run_li_request(elr); sb_end_write(elr->lr_super); mutex_lock(&eli->li_list_mtx); n = pos->next; } up_read((&elr->lr_super->s_umount)); } /* error, remove the lazy_init job */ if (err) { ext4_remove_li_request(elr); continue; } if (!progress) { elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); } if (!next_wakeup_initialized || time_before(elr->lr_next_sched, next_wakeup)) { next_wakeup = elr->lr_next_sched; next_wakeup_initialized = true; } } mutex_unlock(&eli->li_list_mtx); try_to_freeze(); cur = jiffies; if (!next_wakeup_initialized || time_after_eq(cur, next_wakeup)) { cond_resched(); continue; } schedule_timeout_interruptible(next_wakeup - cur); if (kthread_should_stop()) { ext4_clear_request_list(); goto exit_thread; } } exit_thread: /* * It looks like the request list is empty, but we need * to check it under the li_list_mtx lock, to prevent any * additions into it, and of course we should lock ext4_li_mtx * to atomically free the list and ext4_li_info, because at * this point another ext4 filesystem could be registering * new one. */ mutex_lock(&ext4_li_mtx); mutex_lock(&eli->li_list_mtx); if (!list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); mutex_unlock(&ext4_li_mtx); goto cont_thread; } mutex_unlock(&eli->li_list_mtx); kfree(ext4_li_info); ext4_li_info = NULL; mutex_unlock(&ext4_li_mtx); return 0; } static void ext4_clear_request_list(void) { struct list_head *pos, *n; struct ext4_li_request *elr; mutex_lock(&ext4_li_info->li_list_mtx); list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { elr = list_entry(pos, struct ext4_li_request, lr_request); ext4_remove_li_request(elr); } mutex_unlock(&ext4_li_info->li_list_mtx); } static int ext4_run_lazyinit_thread(void) { ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit"); if (IS_ERR(ext4_lazyinit_task)) { int err = PTR_ERR(ext4_lazyinit_task); ext4_clear_request_list(); kfree(ext4_li_info); ext4_li_info = NULL; printk(KERN_CRIT "EXT4-fs: error %d creating inode table " "initialization thread\n", err); return err; } ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; return 0; } /* * Check whether it make sense to run itable init. thread or not. * If there is at least one uninitialized inode table, return * corresponding group number, else the loop goes through all * groups and return total number of groups. */ static ext4_group_t ext4_has_uninit_itable(struct super_block *sb) { ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count; struct ext4_group_desc *gdp = NULL; if (!ext4_has_group_desc_csum(sb)) return ngroups; for (group = 0; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) continue; if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))) break; } return group; } static int ext4_li_info_new(void) { struct ext4_lazy_init *eli = NULL; eli = kzalloc(sizeof(*eli), GFP_KERNEL); if (!eli) return -ENOMEM; INIT_LIST_HEAD(&eli->li_request_list); mutex_init(&eli->li_list_mtx); eli->li_state |= EXT4_LAZYINIT_QUIT; ext4_li_info = eli; return 0; } static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, ext4_group_t start) { struct ext4_li_request *elr; elr = kzalloc(sizeof(*elr), GFP_KERNEL); if (!elr) return NULL; elr->lr_super = sb; elr->lr_first_not_zeroed = start; if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) { elr->lr_mode = EXT4_LI_MODE_ITABLE; elr->lr_next_group = start; } else { elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; } /* * Randomize first schedule time of the request to * spread the inode table initialization requests * better. */ elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); return elr; } int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_li_request *elr = NULL; ext4_group_t ngroups = sbi->s_groups_count; int ret = 0; mutex_lock(&ext4_li_mtx); if (sbi->s_li_request != NULL) { /* * Reset timeout so it can be computed again, because * s_li_wait_mult might have changed. */ sbi->s_li_request->lr_timeout = 0; goto out; } if (sb_rdonly(sb) || (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))) goto out; elr = ext4_li_request_new(sb, first_not_zeroed); if (!elr) { ret = -ENOMEM; goto out; } if (NULL == ext4_li_info) { ret = ext4_li_info_new(); if (ret) goto out; } mutex_lock(&ext4_li_info->li_list_mtx); list_add(&elr->lr_request, &ext4_li_info->li_request_list); mutex_unlock(&ext4_li_info->li_list_mtx); sbi->s_li_request = elr; /* * set elr to NULL here since it has been inserted to * the request_list and the removal and free of it is * handled by ext4_clear_request_list from now on. */ elr = NULL; if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { ret = ext4_run_lazyinit_thread(); if (ret) goto out; } out: mutex_unlock(&ext4_li_mtx); if (ret) kfree(elr); return ret; } /* * We do not need to lock anything since this is called on * module unload. */ static void ext4_destroy_lazyinit_thread(void) { /* * If thread exited earlier * there's nothing to be done. */ if (!ext4_li_info || !ext4_lazyinit_task) return; kthread_stop(ext4_lazyinit_task); } static int set_journal_csum_feature_set(struct super_block *sb) { int ret = 1; int compat, incompat; struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_metadata_csum(sb)) { /* journal checksum v3 */ compat = 0; incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; } else { /* journal checksum v1 */ compat = JBD2_FEATURE_COMPAT_CHECKSUM; incompat = 0; } jbd2_journal_clear_features(sbi->s_journal, JBD2_FEATURE_COMPAT_CHECKSUM, 0, JBD2_FEATURE_INCOMPAT_CSUM_V3 | JBD2_FEATURE_INCOMPAT_CSUM_V2); if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | incompat); } else if (test_opt(sb, JOURNAL_CHECKSUM)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, incompat); jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } else { jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } return ret; } /* * Note: calculating the overhead so we can be compatible with * historical BSD practice is quite difficult in the face of * clusters/bigalloc. This is because multiple metadata blocks from * different block group can end up in the same allocation cluster. * Calculating the exact overhead in the face of clustered allocation * requires either O(all block bitmaps) in memory or O(number of block * groups**2) in time. We will still calculate the superblock for * older file systems --- and if we come across with a bigalloc file * system with zero in s_overhead_clusters the estimate will be close to * correct especially for very large cluster sizes --- but for newer * file systems, it's better to calculate this figure once at mkfs * time, and store it in the superblock. If the superblock value is * present (even for non-bigalloc file systems), we will use it. */ static int count_overhead(struct super_block *sb, ext4_group_t grp, char *buf) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp; ext4_fsblk_t first_block, last_block, b; ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; int has_super = ext4_bg_has_super(sb, grp); if (!ext4_has_feature_bigalloc(sb)) return (has_super + ext4_bg_num_gdb(sb, grp) + (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) + sbi->s_itb_per_group + 2); first_block = le32_to_cpu(sbi->s_es->s_first_data_block) + (grp * EXT4_BLOCKS_PER_GROUP(sb)); last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); b = ext4_block_bitmap(sb, gdp); if (b >= first_block && b <= last_block) { ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); count++; } b = ext4_inode_bitmap(sb, gdp); if (b >= first_block && b <= last_block) { ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf); count++; } b = ext4_inode_table(sb, gdp); if (b >= first_block && b + sbi->s_itb_per_group <= last_block) for (j = 0; j < sbi->s_itb_per_group; j++, b++) { int c = EXT4_B2C(sbi, b - first_block); ext4_set_bit(c, buf); count++; } if (i != grp) continue; s = 0; if (ext4_bg_has_super(sb, grp)) { ext4_set_bit(s++, buf); count++; } j = ext4_bg_num_gdb(sb, grp); if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) { ext4_error(sb, "Invalid number of block group " "descriptor blocks: %d", j); j = EXT4_BLOCKS_PER_GROUP(sb) - s; } count += j; for (; j > 0; j--) ext4_set_bit(EXT4_B2C(sbi, s++), buf); } if (!count) return 0; return EXT4_CLUSTERS_PER_GROUP(sb) - ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8); } /* * Compute the overhead and stash it in sbi->s_overhead */ int ext4_calculate_overhead(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct inode *j_inode; unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum); ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; char *buf = (char *) get_zeroed_page(GFP_NOFS); if (!buf) return -ENOMEM; /* * Compute the overhead (FS structures). This is constant * for a given filesystem unless the number of block groups * changes so we cache the previous value until it does. */ /* * All of the blocks before first_data_block are overhead */ overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block)); /* * Add the overhead found in each block group */ for (i = 0; i < ngroups; i++) { int blks; blks = count_overhead(sb, i, buf); overhead += blks; if (blks) memset(buf, 0, PAGE_SIZE); cond_resched(); } /* * Add the internal journal blocks whether the journal has been * loaded or not */ if (sbi->s_journal && !sbi->s_journal_bdev_file) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); if (!IS_ERR(j_inode)) { j_blocks = j_inode->i_size >> sb->s_blocksize_bits; overhead += EXT4_NUM_B2C(sbi, j_blocks); iput(j_inode); } else { ext4_msg(sb, KERN_ERR, "can't get journal size"); } } sbi->s_overhead = overhead; smp_wmb(); free_page((unsigned long) buf); return 0; } static void ext4_set_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * There's no need to reserve anything when we aren't using extents. * The space estimates are exact, there are no unwritten extents, * hole punching doesn't need new metadata... This is needed especially * to keep ext2/3 backward compatibility. */ if (!ext4_has_feature_extents(sb)) return; /* * By default we reserve 2% or 4096 clusters, whichever is smaller. * This should cover the situations where we can not afford to run * out of space like for example punch hole, or converting * unwritten extents in delalloc path. In most cases such * allocation would require 1, or 2 blocks, higher numbers are * very rare. */ resv_clusters = (ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits); do_div(resv_clusters, 50); resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); atomic64_set(&sbi->s_resv_clusters, resv_clusters); } static const char *ext4_quota_mode(struct super_block *sb) { #ifdef CONFIG_QUOTA if (!ext4_quota_capable(sb)) return "none"; if (EXT4_SB(sb)->s_journal && ext4_is_quota_journalled(sb)) return "journalled"; else return "writeback"; #else return "disabled"; #endif } static void ext4_setup_csum_trigger(struct super_block *sb, enum ext4_journal_trigger_type type, void (*trigger)( struct jbd2_buffer_trigger_type *type, struct buffer_head *bh, void *mapped_data, size_t size)) { struct ext4_sb_info *sbi = EXT4_SB(sb); sbi->s_journal_triggers[type].sb = sb; sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger; } static void ext4_free_sbi(struct ext4_sb_info *sbi) { if (!sbi) return; kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); } static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) { struct ext4_sb_info *sbi; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return NULL; sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, NULL, NULL); sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); if (!sbi->s_blockgroup_lock) goto err_out; sb->s_fs_info = sbi; sbi->s_sb = sb; return sbi; err_out: fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); return NULL; } static void ext4_set_def_opts(struct super_block *sb, struct ext4_super_block *es) { unsigned long def_mount_opts; /* Set defaults before we parse the mount options */ def_mount_opts = le32_to_cpu(es->s_default_mount_opts); set_opt(sb, INIT_INODE_TABLE); if (def_mount_opts & EXT4_DEFM_DEBUG) set_opt(sb, DEBUG); if (def_mount_opts & EXT4_DEFM_BSDGROUPS) set_opt(sb, GRPID); if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ set_opt(sb, XATTR_USER); #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif if (ext4_has_feature_fast_commit(sb)) set_opt2(sb, JOURNAL_FAST_COMMIT); /* don't forget to enable journal_csum when metadata_csum is enabled. */ if (ext4_has_metadata_csum(sb)) set_opt(sb, JOURNAL_CHECKSUM); if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) set_opt(sb, ORDERED_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) set_opt(sb, WRITEBACK_DATA); if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_PANIC) set_opt(sb, ERRORS_PANIC); else if (le16_to_cpu(es->s_errors) == EXT4_ERRORS_CONTINUE) set_opt(sb, ERRORS_CONT); else set_opt(sb, ERRORS_RO); /* block_validity enabled by default; disable with noblock_validity */ set_opt(sb, BLOCK_VALIDITY); if (def_mount_opts & EXT4_DEFM_DISCARD) set_opt(sb, DISCARD); if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) set_opt(sb, BARRIER); /* * enable delayed allocation by default * Use -o nodelalloc to turn it off */ if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) && ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sb, DELALLOC); if (sb->s_blocksize <= PAGE_SIZE) set_opt(sb, DIOREAD_NOLOCK); } static int ext4_handle_clustersize(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int clustersize; /* Handle clustersize */ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); if (ext4_has_feature_bigalloc(sb)) { if (clustersize < sb->s_blocksize) { ext4_msg(sb, KERN_ERR, "cluster size (%d) smaller than " "block size (%lu)", clustersize, sb->s_blocksize); return -EINVAL; } sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) - le32_to_cpu(es->s_log_block_size); } else { if (clustersize != sb->s_blocksize) { ext4_msg(sb, KERN_ERR, "fragment/cluster size (%d) != " "block size (%lu)", clustersize, sb->s_blocksize); return -EINVAL; } if (sbi->s_blocks_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "#blocks per group too big: %lu", sbi->s_blocks_per_group); return -EINVAL; } sbi->s_cluster_bits = 0; } sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group); if (sbi->s_clusters_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu", sbi->s_clusters_per_group); return -EINVAL; } if (sbi->s_blocks_per_group != (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) { ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and clusters per group (%lu) inconsistent", sbi->s_blocks_per_group, sbi->s_clusters_per_group); return -EINVAL; } sbi->s_cluster_ratio = clustersize / sb->s_blocksize; /* Do we have standard group size of clustersize * 8 blocks ? */ if (sbi->s_blocks_per_group == clustersize << 3) set_opt2(sb, STD_GROUP_SIZE); return 0; } /* * ext4_atomic_write_init: Initializes filesystem min & max atomic write units. * @sb: super block * TODO: Later add support for bigalloc */ static void ext4_atomic_write_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct block_device *bdev = sb->s_bdev; if (!bdev_can_atomic_write(bdev)) return; if (!ext4_has_feature_extents(sb)) return; sbi->s_awu_min = max(sb->s_blocksize, bdev_atomic_write_unit_min_bytes(bdev)); sbi->s_awu_max = min(sb->s_blocksize, bdev_atomic_write_unit_max_bytes(bdev)); if (sbi->s_awu_min && sbi->s_awu_max && sbi->s_awu_min <= sbi->s_awu_max) { ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u", sbi->s_awu_min, sbi->s_awu_max); } else { sbi->s_awu_min = 0; sbi->s_awu_max = 0; } } static void ext4_fast_commit_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* Initialize fast commit stuff */ atomic_set(&sbi->s_fc_subtid, 0); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_q[FC_Q_STAGING]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); sbi->s_fc_ineligible_tid = 0; spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; sbi->s_fc_replay_state.fc_regions_size = 0; sbi->s_fc_replay_state.fc_regions_used = 0; sbi->s_fc_replay_state.fc_regions_valid = 0; sbi->s_fc_replay_state.fc_modified_inodes = NULL; sbi->s_fc_replay_state.fc_modified_inodes_size = 0; sbi->s_fc_replay_state.fc_modified_inodes_used = 0; } static int ext4_inode_info_init(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; } else { sbi->s_inode_size = le16_to_cpu(es->s_inode_size); sbi->s_first_ino = le32_to_cpu(es->s_first_ino); if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { ext4_msg(sb, KERN_ERR, "invalid first ino: %u", sbi->s_first_ino); return -EINVAL; } if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > sb->s_blocksize)) { ext4_msg(sb, KERN_ERR, "unsupported inode size: %d", sbi->s_inode_size); ext4_msg(sb, KERN_ERR, "blocksize: %lu", sb->s_blocksize); return -EINVAL; } /* * i_atime_extra is the last extra field available for * [acm]times in struct ext4_inode. Checking for that * field should suffice to ensure we have extra space * for all three. */ if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + sizeof(((struct ext4_inode *)0)->i_atime_extra)) { sb->s_time_gran = 1; sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; } else { sb->s_time_gran = NSEC_PER_SEC; sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; } sb->s_time_min = EXT4_TIMESTAMP_MIN; } if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { sbi->s_want_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; if (ext4_has_feature_extra_isize(sb)) { unsigned v, max = (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE); v = le16_to_cpu(es->s_want_extra_isize); if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_want_extra_isize: %d", v); return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; v = le16_to_cpu(es->s_min_extra_isize); if (v > max) { ext4_msg(sb, KERN_ERR, "bad s_min_extra_isize: %d", v); return -EINVAL; } if (sbi->s_want_extra_isize < v) sbi->s_want_extra_isize = v; } } return 0; } #if IS_ENABLED(CONFIG_UNICODE) static int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) { const struct ext4_sb_encodings *encoding_info; struct unicode_map *encoding; __u16 encoding_flags = le16_to_cpu(es->s_encoding_flags); if (!ext4_has_feature_casefold(sb) || sb->s_encoding) return 0; encoding_info = ext4_sb_read_encoding(es); if (!encoding_info) { ext4_msg(sb, KERN_ERR, "Encoding requested by superblock is unknown"); return -EINVAL; } encoding = utf8_load(encoding_info->version); if (IS_ERR(encoding)) { ext4_msg(sb, KERN_ERR, "can't mount with superblock charset: %s-%u.%u.%u " "not supported by the kernel. flags: 0x%x.", encoding_info->name, unicode_major(encoding_info->version), unicode_minor(encoding_info->version), unicode_rev(encoding_info->version), encoding_flags); return -EINVAL; } ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: " "%s-%u.%u.%u with flags 0x%hx", encoding_info->name, unicode_major(encoding_info->version), unicode_minor(encoding_info->version), unicode_rev(encoding_info->version), encoding_flags); sb->s_encoding = encoding; sb->s_encoding_flags = encoding_flags; return 0; } #else static inline int ext4_encoding_init(struct super_block *sb, struct ext4_super_block *es) { return 0; } #endif static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); /* Warn if metadata_csum and gdt_csum are both set. */ if (ext4_has_feature_metadata_csum(sb) && ext4_has_feature_gdt_csum(sb)) ext4_warning(sb, "metadata_csum and uninit_bg are " "redundant flags; please run fsck."); /* Check for a known checksum algorithm */ if (!ext4_verify_csum_type(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " "unknown checksum algorithm."); return -EINVAL; } ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, ext4_orphan_file_block_trigger); /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { int ret = PTR_ERR(sbi->s_chksum_driver); ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); sbi->s_chksum_driver = NULL; return ret; } /* Check superblock checksum */ if (!ext4_superblock_csum_verify(sb, es)) { ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " "invalid superblock checksum. Run e2fsck?"); return -EFSBADCRC; } /* Precompute checksum seed for all metadata */ if (ext4_has_feature_csum_seed(sb)) sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, sizeof(es->s_uuid)); return 0; } static int ext4_check_feature_compatibility(struct super_block *sb, struct ext4_super_block *es, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (ext4_has_compat_features(sb) || ext4_has_ro_compat_features(sb) || ext4_has_incompat_features(sb))) ext4_msg(sb, KERN_WARNING, "feature flags set on rev 0 fs, " "running e2fsck is recommended"); if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { set_opt2(sb, HURD_COMPAT); if (ext4_has_feature_64bit(sb)) { ext4_msg(sb, KERN_ERR, "The Hurd can't support 64-bit file systems"); return -EINVAL; } /* * ea_inode feature uses l_i_version field which is not * available in HURD_COMPAT mode. */ if (ext4_has_feature_ea_inode(sb)) { ext4_msg(sb, KERN_ERR, "ea_inode feature is not supported for Hurd"); return -EINVAL; } } if (IS_EXT2_SB(sb)) { if (ext2_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext2 file system " "using the ext4 subsystem"); else { /* * If we're probing be silent, if this looks like * it's actually an ext[34] filesystem. */ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due " "to feature incompatibilities"); return -EINVAL; } } if (IS_EXT3_SB(sb)) { if (ext3_feature_set_ok(sb)) ext4_msg(sb, KERN_INFO, "mounting ext3 file system " "using the ext4 subsystem"); else { /* * If we're probing be silent, if this looks like * it's actually an ext4 filesystem. */ if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb))) return -EINVAL; ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due " "to feature incompatibilities"); return -EINVAL; } } /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, * so there is a chance incompat flags are set on a rev 0 filesystem. */ if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) return -EINVAL; if (sbi->s_daxdev) { if (sb->s_blocksize == PAGE_SIZE) set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); else ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); } if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { if (ext4_has_feature_inline_data(sb)) { ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" " that may contain inline data"); return -EINVAL; } if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { ext4_msg(sb, KERN_ERR, "DAX unsupported by block device."); return -EINVAL; } } if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", es->s_encryption_level); return -EINVAL; } return 0; } static int ext4_check_geometry(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); __u64 blocks_count; int err; if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { ext4_msg(sb, KERN_ERR, "Number of reserved GDT blocks insanely large: %d", le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); return -EINVAL; } /* * Test whether we have more sectors than will fit in sector_t, * and whether the max offset is addressable by the page cache. */ err = generic_check_addressable(sb->s_blocksize_bits, ext4_blocks_count(es)); if (err) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); return err; } /* check blocks count against device size */ blocks_count = sb_bdev_nr_blocks(sb); if (blocks_count && ext4_blocks_count(es) > blocks_count) { ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " "exceeds size of device (%llu blocks)", ext4_blocks_count(es), blocks_count); return -EINVAL; } /* * It makes no sense for the first data block to be beyond the end * of the filesystem. */ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block %u is beyond end of filesystem (%llu)", le32_to_cpu(es->s_first_data_block), ext4_blocks_count(es)); return -EINVAL; } if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) && (sbi->s_cluster_ratio == 1)) { ext4_msg(sb, KERN_WARNING, "bad geometry: first data " "block is 0 with a 1k block and cluster size"); return -EINVAL; } blocks_count = (ext4_blocks_count(es) - le32_to_cpu(es->s_first_data_block) + EXT4_BLOCKS_PER_GROUP(sb) - 1); do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { ext4_msg(sb, KERN_WARNING, "groups count too large: %llu " "(block count %llu, first data block %u, " "blocks per group %lu)", blocks_count, ext4_blocks_count(es), le32_to_cpu(es->s_first_data_block), EXT4_BLOCKS_PER_GROUP(sb)); return -EINVAL; } sbi->s_groups_count = blocks_count; sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) != le32_to_cpu(es->s_inodes_count)) { ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu", le32_to_cpu(es->s_inodes_count), ((u64)sbi->s_groups_count * sbi->s_inodes_per_group)); return -EINVAL; } return 0; } static int ext4_group_desc_init(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t logical_sb_block, ext4_group_t *first_not_zeroed) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned int db_count; ext4_fsblk_t block; int i; db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); if (ext4_has_feature_meta_bg(sb)) { if (le32_to_cpu(es->s_first_meta_bg) > db_count) { ext4_msg(sb, KERN_WARNING, "first meta block group too large: %u " "(group descriptor block count %u)", le32_to_cpu(es->s_first_meta_bg), db_count); return -EINVAL; } } rcu_assign_pointer(sbi->s_group_desc, kvmalloc_array(db_count, sizeof(struct buffer_head *), GFP_KERNEL)); if (sbi->s_group_desc == NULL) { ext4_msg(sb, KERN_ERR, "not enough memory"); return -ENOMEM; } bgl_lock_init(sbi->s_blockgroup_lock); /* Pre-read the descriptors into the buffer cache */ for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); ext4_sb_breadahead_unmovable(sb, block); } for (i = 0; i < db_count; i++) { struct buffer_head *bh; block = descriptor_loc(sb, logical_sb_block, i); bh = ext4_sb_bread_unmovable(sb, block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "can't read group descriptor %d", i); sbi->s_gdb_count = i; return PTR_ERR(bh); } rcu_read_lock(); rcu_dereference(sbi->s_group_desc)[i] = bh; rcu_read_unlock(); } sbi->s_gdb_count = db_count; if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); return -EFSCORRUPTED; } return 0; } static int ext4_load_and_init_journal(struct super_block *sb, struct ext4_super_block *es, struct ext4_fs_context *ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err; err = ext4_load_journal(sb, es, ctx->journal_devnum); if (err) return err; if (ext4_has_feature_64bit(sb) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); goto out; } if (!set_journal_csum_feature_set(sb)) { ext4_msg(sb, KERN_ERR, "Failed to set journal checksum " "feature set"); goto out; } if (test_opt2(sb, JOURNAL_FAST_COMMIT) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_FAST_COMMIT)) { ext4_msg(sb, KERN_ERR, "Failed to set fast commit journal feature"); goto out; } /* We have now updated the journal if required, so we can * validate the data journaling mode. */ switch (test_opt(sb, DATA_FLAGS)) { case 0: /* No mode set, assume a default based on the journal * capabilities: ORDERED_DATA if the journal can * cope, else JOURNAL_DATA */ if (jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { set_opt(sb, ORDERED_DATA); sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA; } else { set_opt(sb, JOURNAL_DATA); sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA; } break; case EXT4_MOUNT_ORDERED_DATA: case EXT4_MOUNT_WRITEBACK_DATA: if (!jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { ext4_msg(sb, KERN_ERR, "Journal does not support " "requested data journaling mode"); goto out; } break; default: break; } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit in data=ordered mode"); goto out; } set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); sbi->s_journal->j_submit_inode_data_buffers = ext4_journal_submit_inode_data_buffers; sbi->s_journal->j_finish_inode_data_buffers = ext4_journal_finish_inode_data_buffers; return 0; out: /* flush s_sb_upd_work before destroying the journal. */ flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; return -EINVAL; } static int ext4_check_journal_data_mode(struct super_block *sb) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with " "data=journal disables delayed allocation, " "dioread_nolock, O_DIRECT and fast_commit support!\n"); /* can't mount with both data=journal and dioread_nolock. */ clear_opt(sb, DIOREAD_NOLOCK); clear_opt2(sb, JOURNAL_FAST_COMMIT); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); return -EINVAL; } if (test_opt(sb, DAX_ALWAYS)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); return -EINVAL; } if (ext4_has_feature_encrypt(sb)) { ext4_msg(sb, KERN_WARNING, "encrypted files will use data=ordered " "instead of data journaling mode"); } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } else { sb->s_iflags |= SB_I_CGROUPWB; } return 0; } static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es; ext4_fsblk_t logical_sb_block; unsigned long offset = 0; struct buffer_head *bh; int ret = -EINVAL; int blocksize; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { ext4_msg(sb, KERN_ERR, "unable to set blocksize"); return -EINVAL; } /* * The ext4 superblock will not be buffer aligned for other than 1kB * block sizes. We need to calculate the offset from buffer start. */ if (blocksize != EXT4_MIN_BLOCK_SIZE) { logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); } else { logical_sb_block = sbi->s_sb_block; } bh = ext4_sb_bread_unmovable(sb, logical_sb_block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "unable to read superblock"); return PTR_ERR(bh); } /* * Note: s_es must be initialized as soon as possible because * some ext4 macro-instructions depend on its value */ es = (struct ext4_super_block *) (bh->b_data + offset); sbi->s_es = es; sb->s_magic = le16_to_cpu(es->s_magic); if (sb->s_magic != EXT4_SUPER_MAGIC) { if (!silent) ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto out; } if (le32_to_cpu(es->s_log_block_size) > (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, "Invalid log block size: %u", le32_to_cpu(es->s_log_block_size)); goto out; } if (le32_to_cpu(es->s_log_cluster_size) > (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, "Invalid log cluster size: %u", le32_to_cpu(es->s_log_cluster_size)); goto out; } blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); /* * If the default block size is not the same as the real block size, * we need to reload it. */ if (sb->s_blocksize == blocksize) { *lsb = logical_sb_block; sbi->s_sbh = bh; return 0; } /* * bh must be released before kill_bdev(), otherwise * it won't be freed and its page also. kill_bdev() * is called by sb_set_blocksize(). */ brelse(bh); /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { ext4_msg(sb, KERN_ERR, "bad block size %d", blocksize); bh = NULL; goto out; } logical_sb_block = sbi->s_sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); bh = ext4_sb_bread_unmovable(sb, logical_sb_block); if (IS_ERR(bh)) { ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try"); ret = PTR_ERR(bh); bh = NULL; goto out; } es = (struct ext4_super_block *)(bh->b_data + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!"); goto out; } *lsb = logical_sb_block; sbi->s_sbh = bh; return 0; out: brelse(bh); return ret; } static int ext4_hash_info_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; unsigned int i; sbi->s_def_hash_version = es->s_def_hash_version; if (sbi->s_def_hash_version > DX_HASH_LAST) { ext4_msg(sb, KERN_ERR, "Invalid default hash set in the superblock"); return -EINVAL; } else if (sbi->s_def_hash_version == DX_HASH_SIPHASH) { ext4_msg(sb, KERN_ERR, "SIPHASH is not a valid default hash value"); return -EINVAL; } for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); if (ext4_has_feature_dir_index(sb)) { i = le32_to_cpu(es->s_flags); if (i & EXT2_FLAGS_UNSIGNED_HASH) sbi->s_hash_unsigned = 3; else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { #ifdef __CHAR_UNSIGNED__ if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); sbi->s_hash_unsigned = 3; #else if (!sb_rdonly(sb)) es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif } } return 0; } static int ext4_block_group_meta_init(struct super_block *sb, int silent) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; int has_huge_files; has_huge_files = ext4_has_feature_huge_file(sb); sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, has_huge_files); sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); sbi->s_desc_size = le16_to_cpu(es->s_desc_size); if (ext4_has_feature_64bit(sb)) { if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || sbi->s_desc_size > EXT4_MAX_DESC_SIZE || !is_power_of_2(sbi->s_desc_size)) { ext4_msg(sb, KERN_ERR, "unsupported descriptor size %lu", sbi->s_desc_size); return -EINVAL; } } else sbi->s_desc_size = EXT4_MIN_DESC_SIZE; sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { if (!silent) ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); return -EINVAL; } if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || sbi->s_inodes_per_group > sb->s_blocksize * 8) { ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", sbi->s_inodes_per_group); return -EINVAL; } sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); return 0; } /* * It's hard to get stripe aligned blocks if stripe is not aligned with * cluster, just disable stripe and alert user to simplify code and avoid * stripe aligned allocation which will rarely succeed. */ static bool ext4_is_stripe_incompatible(struct super_block *sb, unsigned long stripe) { struct ext4_sb_info *sbi = EXT4_SB(sb); return (stripe > 0 && sbi->s_cluster_ratio > 1 && stripe % sbi->s_cluster_ratio != 0); } static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) { struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t logical_sb_block; struct inode *root; int needs_recovery; int err; ext4_group_t first_not_zeroed; struct ext4_fs_context *ctx = fc->fs_private; int silent = fc->sb_flags & SB_SILENT; /* Set defaults for the variables that will be set during parsing */ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sectors_written_start = part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); err = ext4_load_super(sb, &logical_sb_block, silent); if (err) goto out_fail; es = sbi->s_es; sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); err = ext4_init_metadata_csum(sb, es); if (err) goto failed_mount; ext4_set_def_opts(sb, es); sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; /* * set default s_li_wait_mult for lazyinit, for the case there is * no mount option specified. */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; err = ext4_inode_info_init(sb, es); if (err) goto failed_mount; err = parse_apply_sb_mount_options(sb, ctx); if (err < 0) goto failed_mount; sbi->s_def_mount_opt = sbi->s_mount_opt; sbi->s_def_mount_opt2 = sbi->s_mount_opt2; err = ext4_check_opt_consistency(fc, sb); if (err < 0) goto failed_mount; ext4_apply_options(fc, sb); err = ext4_encoding_init(sb, es); if (err) goto failed_mount; err = ext4_check_journal_data_mode(sb); if (err) goto failed_mount; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); /* i_version is always enabled now */ sb->s_flags |= SB_I_VERSION; err = ext4_check_feature_compatibility(sb, es, silent); if (err) goto failed_mount; err = ext4_block_group_meta_init(sb, silent); if (err) goto failed_mount; err = ext4_hash_info_init(sb); if (err) goto failed_mount; err = ext4_handle_clustersize(sb); if (err) goto failed_mount; err = ext4_check_geometry(sb, es); if (err) goto failed_mount; timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); INIT_WORK(&sbi->s_sb_upd_work, update_super_work); err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); if (err) goto failed_mount3; err = ext4_es_register_shrinker(sbi); if (err) goto failed_mount3; sbi->s_stripe = ext4_get_stripe_size(sbi); if (ext4_is_stripe_incompatible(sb, sbi->s_stripe)) { ext4_msg(sb, KERN_WARNING, "stripe (%lu) is not aligned with cluster size (%u), " "stripe is disabled", sbi->s_stripe, sbi->s_cluster_ratio); sbi->s_stripe = 0; } sbi->s_extent_max_zeroout_kb = 32; /* * set up enough so that it can read an inode */ sb->s_op = &ext4_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_FS_ENCRYPTION sb->s_cop = &ext4_cryptops; #endif #ifdef CONFIG_FS_VERITY sb->s_vop = &ext4_verityops; #endif #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; if (ext4_has_feature_quota(sb)) sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &ext4_qctl_operations; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid)); super_set_sysfs_name_bdev(sb); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); spin_lock_init(&sbi->s_bdev_wb_lock); ext4_atomic_write_init(sb); ext4_fast_commit_init(sb); sb->s_root = NULL; needs_recovery = (es->s_last_orphan != 0 || ext4_has_feature_orphan_present(sb) || ext4_has_feature_journal_needs_recovery(sb)); if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) { err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); if (err) goto failed_mount3a; } err = -EINVAL; /* * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! */ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { err = ext4_load_and_init_journal(sb, es, ctx); if (err) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); goto failed_mount3a; } else { /* Nojournal mode, all journal mount options are illegal */ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit, fs mounted w/o journal"); goto failed_mount3a; } if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_checksum, fs mounted w/o journal"); goto failed_mount3a; } if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { ext4_msg(sb, KERN_ERR, "can't mount with " "commit=%lu, fs mounted w/o journal", sbi->s_commit_interval / HZ); goto failed_mount3a; } if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { ext4_msg(sb, KERN_ERR, "can't mount with " "data=, fs mounted w/o journal"); goto failed_mount3a; } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); clear_opt2(sb, JOURNAL_FAST_COMMIT); sbi->s_journal = NULL; needs_recovery = 0; } if (!test_opt(sb, NO_MBCACHE)) { sbi->s_ea_block_cache = ext4_xattr_create_cache(); if (!sbi->s_ea_block_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_block_cache"); err = -EINVAL; goto failed_mount_wq; } if (ext4_has_feature_ea_inode(sb)) { sbi->s_ea_inode_cache = ext4_xattr_create_cache(); if (!sbi->s_ea_inode_cache) { ext4_msg(sb, KERN_ERR, "Failed to create ea_inode_cache"); err = -EINVAL; goto failed_mount_wq; } } } /* * Get the # of file system overhead blocks from the * superblock if present. */ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters); /* ignore the precalculated value if it is ridiculous */ if (sbi->s_overhead > ext4_blocks_count(es)) sbi->s_overhead = 0; /* * If the bigalloc feature is not enabled recalculating the * overhead doesn't take long, so we might as well just redo * it to make sure we are using the correct value. */ if (!ext4_has_feature_bigalloc(sb)) sbi->s_overhead = 0; if (sbi->s_overhead == 0) { err = ext4_calculate_overhead(sb); if (err) goto failed_mount_wq; } /* * The maximum number of concurrent works can be high and * concurrency isn't really necessary. Limit it to 1. */ EXT4_SB(sb)->rsv_conversion_wq = alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!EXT4_SB(sb)->rsv_conversion_wq) { printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); err = -ENOMEM; goto failed_mount4; } /* * The jbd2_journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. */ root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL); if (IS_ERR(root)) { ext4_msg(sb, KERN_ERR, "get root inode failed"); err = PTR_ERR(root); root = NULL; goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); iput(root); err = -EFSCORRUPTED; goto failed_mount4; } generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); err = -ENOMEM; goto failed_mount4; } err = ext4_setup_super(sb, es, sb_rdonly(sb)); if (err == -EROFS) { sb->s_flags |= SB_RDONLY; } else if (err) goto failed_mount4a; ext4_set_resv_clusters(sb); if (test_opt(sb, BLOCK_VALIDITY)) { err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); goto failed_mount4a; } } ext4_fc_replay_cleanup(sb); ext4_ext_init(sb); /* * Enable optimize_scan if number of groups is > threshold. This can be * turned off by passing "mb_optimize_scan=0". This can also be * turned on forcefully by passing "mb_optimize_scan=1". */ if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) { if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD) set_opt2(sb, MB_OPTIMIZE_SCAN); else clear_opt2(sb, MB_OPTIMIZE_SCAN); } err = ext4_mb_init(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", err); goto failed_mount5; } /* * We can only set up the journal commit callback once * mballoc is initialized */ if (sbi->s_journal) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; err = ext4_percpu_param_init(sbi); if (err) goto failed_mount6; if (ext4_has_feature_flex_bg(sb)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); err = -ENOMEM; goto failed_mount6; } err = ext4_register_li_request(sb, first_not_zeroed); if (err) goto failed_mount6; err = ext4_init_orphan_info(sb); if (err) goto failed_mount7; #ifdef CONFIG_QUOTA /* Enable quota usage during mount. */ if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { err = ext4_enable_quotas(sb); if (err) goto failed_mount8; } #endif /* CONFIG_QUOTA */ /* * Save the original bdev mapping's wb_err value which could be * used to detect the metadata async write error. */ errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err, &sbi->s_bdev_wb_err); EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; /* * Update the checksum after updating free space/inode counters and * ext4_orphan_cleanup. Otherwise the superblock can have an incorrect * checksum in the buffer cache until it is written out and * e2fsprogs programs trying to open a file system immediately * after it is mounted can fail. */ ext4_superblock_csum_set(sb); if (needs_recovery) { ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); if (err) goto failed_mount9; } if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) ext4_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but the device does not support discard"); if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ /* Enable message ratelimiting. Default is 10 messages per 5 secs. */ ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10); ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); atomic_set(&sbi->s_warning_count, 0); atomic_set(&sbi->s_msg_count, 0); /* Register sysfs after all initializations are complete. */ err = ext4_register_sysfs(sb); if (err) goto failed_mount9; return 0; failed_mount9: ext4_quotas_off(sb, EXT4_MAXQUOTAS); failed_mount8: __maybe_unused ext4_release_orphan_info(sb); failed_mount7: ext4_unregister_li_request(sb); failed_mount6: ext4_mb_release(sb); ext4_flex_groups_free(sbi); ext4_percpu_param_destroy(sbi); failed_mount5: ext4_ext_release(sb); ext4_release_system_zone(sb); failed_mount4a: dput(sb->s_root); sb->s_root = NULL; failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); if (EXT4_SB(sb)->rsv_conversion_wq) destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); failed_mount_wq: ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); sbi->s_ea_inode_cache = NULL; ext4_xattr_destroy_cache(sbi->s_ea_block_cache); sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { /* flush s_sb_upd_work before journal destroy. */ flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: /* flush s_sb_upd_work before sbi destroy */ flush_work(&sbi->s_sb_upd_work); ext4_stop_mmpd(sbi); del_timer_sync(&sbi->s_err_report); ext4_group_desc_free(sbi); failed_mount: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); #endif #ifdef CONFIG_QUOTA for (unsigned int i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); brelse(sbi->s_sbh); if (sbi->s_journal_bdev_file) { invalidate_bdev(file_bdev(sbi->s_journal_bdev_file)); bdev_fput(sbi->s_journal_bdev_file); } out_fail: invalidate_bdev(sb->s_bdev); sb->s_fs_info = NULL; return err; } static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi; const char *descr; int ret; sbi = ext4_alloc_sbi(sb); if (!sbi) return -ENOMEM; fc->s_fs_info = sbi; /* Cleanup superblock name */ strreplace(sb->s_id, '/', '!'); sbi->s_sb_block = 1; /* Default super block location */ if (ctx->spec & EXT4_SPEC_s_sb_block) sbi->s_sb_block = ctx->s_sb_block; ret = __ext4_fill_super(fc, sb); if (ret < 0) goto free_sbi; if (sbi->s_journal) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) descr = " journalled data mode"; else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) descr = " ordered data mode"; else descr = " writeback data mode"; } else descr = "out journal"; if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) ext4_msg(sb, KERN_INFO, "mounted filesystem %pU %s with%s. " "Quota mode: %s.", &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", descr, ext4_quota_mode(sb)); /* Update the s_overhead_clusters if necessary */ ext4_update_overhead(sb, false); return 0; free_sbi: ext4_free_sbi(sbi); fc->s_fs_info = NULL; return ret; } static int ext4_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, ext4_fill_super); } /* * Setup any per-fs journal parameters now. We'll do this both on * initial mount, once the journal has been initialised but before we've * done any recovery; and again on any subsequent remount. */ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) { struct ext4_sb_info *sbi = EXT4_SB(sb); journal->j_commit_interval = sbi->s_commit_interval; journal->j_min_batch_time = sbi->s_min_batch_time; journal->j_max_batch_time = sbi->s_max_batch_time; ext4_fc_init(sb, journal); write_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; if (test_opt(sb, DATA_ERR_ABORT)) journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; else journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; /* * Always enable journal cycle record option, letting the journal * records log transactions continuously between each mount. */ journal->j_flags |= JBD2_CYCLE_RECORD; write_unlock(&journal->j_state_lock); } static struct inode *ext4_get_journal_inode(struct super_block *sb, unsigned int journal_inum) { struct inode *journal_inode; /* * Test for the existence of a valid inode on disk. Bad things * happen if we iget() an unused inode, as the subsequent iput() * will try to delete it. */ journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); return ERR_CAST(journal_inode); } if (!journal_inode->i_nlink) { make_bad_inode(journal_inode); iput(journal_inode); ext4_msg(sb, KERN_ERR, "journal inode is deleted"); return ERR_PTR(-EFSCORRUPTED); } if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); return ERR_PTR(-EFSCORRUPTED); } ext4_debug("Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); return journal_inode; } static int ext4_journal_bmap(journal_t *journal, sector_t *block) { struct ext4_map_blocks map; int ret; if (journal->j_inode == NULL) return 0; map.m_lblk = *block; map.m_len = 1; ret = ext4_map_blocks(NULL, journal->j_inode, &map, 0); if (ret <= 0) { ext4_msg(journal->j_inode->i_sb, KERN_CRIT, "journal bmap failed: block %llu ret %d\n", *block, ret); jbd2_journal_abort(journal, ret ? ret : -EIO); return ret; } *block = map.m_pblk; return 0; } static journal_t *ext4_open_inode_journal(struct super_block *sb, unsigned int journal_inum) { struct inode *journal_inode; journal_t *journal; journal_inode = ext4_get_journal_inode(sb, journal_inum); if (IS_ERR(journal_inode)) return ERR_CAST(journal_inode); journal = jbd2_journal_init_inode(journal_inode); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "Could not load journal inode"); iput(journal_inode); return ERR_CAST(journal); } journal->j_private = sb; journal->j_bmap = ext4_journal_bmap; ext4_init_journal_params(sb, journal); return journal; } static struct file *ext4_get_journal_blkdev(struct super_block *sb, dev_t j_dev, ext4_fsblk_t *j_start, ext4_fsblk_t *j_len) { struct buffer_head *bh; struct block_device *bdev; struct file *bdev_file; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; int errno; bdev_file = bdev_file_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, sb, &fs_holder_ops); if (IS_ERR(bdev_file)) { ext4_msg(sb, KERN_ERR, "failed to open journal device unknown-block(%u,%u) %ld", MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file)); return bdev_file; } bdev = file_bdev(bdev_file); blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { ext4_msg(sb, KERN_ERR, "blocksize too small for journal device"); errno = -EINVAL; goto out_bdev; } sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; offset = EXT4_MIN_BLOCK_SIZE % blocksize; set_blocksize(bdev_file, blocksize); bh = __bread(bdev, sb_block, blocksize); if (!bh) { ext4_msg(sb, KERN_ERR, "couldn't read superblock of " "external journal"); errno = -EINVAL; goto out_bdev; } es = (struct ext4_super_block *) (bh->b_data + offset); if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { ext4_msg(sb, KERN_ERR, "external journal has bad superblock"); errno = -EFSCORRUPTED; goto out_bh; } if ((le32_to_cpu(es->s_feature_ro_compat) & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && es->s_checksum != ext4_superblock_csum(sb, es)) { ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); errno = -EFSCORRUPTED; goto out_bh; } if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { ext4_msg(sb, KERN_ERR, "journal UUID does not match"); errno = -EFSCORRUPTED; goto out_bh; } *j_start = sb_block + 1; *j_len = ext4_blocks_count(es); brelse(bh); return bdev_file; out_bh: brelse(bh); out_bdev: bdev_fput(bdev_file); return ERR_PTR(errno); } static journal_t *ext4_open_dev_journal(struct super_block *sb, dev_t j_dev) { journal_t *journal; ext4_fsblk_t j_start; ext4_fsblk_t j_len; struct file *bdev_file; int errno = 0; bdev_file = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); if (IS_ERR(bdev_file)) return ERR_CAST(bdev_file); journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, j_start, j_len, sb->s_blocksize); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); errno = PTR_ERR(journal); goto out_bdev; } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { ext4_msg(sb, KERN_ERR, "External journal has more than one " "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); errno = -EINVAL; goto out_journal; } journal->j_private = sb; EXT4_SB(sb)->s_journal_bdev_file = bdev_file; ext4_init_journal_params(sb, journal); return journal; out_journal: jbd2_journal_destroy(journal); out_bdev: bdev_fput(bdev_file); return ERR_PTR(errno); } static int ext4_load_journal(struct super_block *sb, struct ext4_super_block *es, unsigned long journal_devnum) { journal_t *journal; unsigned int journal_inum = le32_to_cpu(es->s_journal_inum); dev_t journal_dev; int err = 0; int really_read_only; int journal_dev_ro; if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) return -EFSCORRUPTED; if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { ext4_msg(sb, KERN_INFO, "external journal device major/minor " "numbers have changed"); journal_dev = new_decode_dev(journal_devnum); } else journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); if (journal_inum && journal_dev) { ext4_msg(sb, KERN_ERR, "filesystem has both journal inode and journal device!"); return -EINVAL; } if (journal_inum) { journal = ext4_open_inode_journal(sb, journal_inum); if (IS_ERR(journal)) return PTR_ERR(journal); } else { journal = ext4_open_dev_journal(sb, journal_dev); if (IS_ERR(journal)) return PTR_ERR(journal); } journal_dev_ro = bdev_read_only(journal->j_dev); really_read_only = bdev_read_only(sb->s_bdev) | journal_dev_ro; if (journal_dev_ro && !sb_rdonly(sb)) { ext4_msg(sb, KERN_ERR, "journal device read-only, try mounting with '-o ro'"); err = -EROFS; goto err_out; } /* * Are we loading a blank journal or performing recovery after a * crash? For recovery, we need to check in advance whether we * can get read-write access to the device. */ if (ext4_has_feature_journal_needs_recovery(sb)) { if (sb_rdonly(sb)) { ext4_msg(sb, KERN_INFO, "INFO: recovery " "required on readonly filesystem"); if (really_read_only) { ext4_msg(sb, KERN_ERR, "write access " "unavailable, cannot proceed " "(try mounting with noload)"); err = -EROFS; goto err_out; } ext4_msg(sb, KERN_INFO, "write access will " "be enabled during recovery"); } } if (!(journal->j_flags & JBD2_BARRIER)) ext4_msg(sb, KERN_INFO, "barriers disabled"); if (!ext4_has_feature_journal_needs_recovery(sb)) err = jbd2_journal_wipe(journal, !really_read_only); if (!err) { char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); __le16 orig_state; bool changed = false; if (save) memcpy(save, ((char *) es) + EXT4_S_ERR_START, EXT4_S_ERR_LEN); err = jbd2_journal_load(journal); if (save && memcmp(((char *) es) + EXT4_S_ERR_START, save, EXT4_S_ERR_LEN)) { memcpy(((char *) es) + EXT4_S_ERR_START, save, EXT4_S_ERR_LEN); changed = true; } kfree(save); orig_state = es->s_state; es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS); if (orig_state != es->s_state) changed = true; /* Write out restored error information to the superblock */ if (changed && !really_read_only) { int err2; err2 = ext4_commit_super(sb); err = err ? : err2; } } if (err) { ext4_msg(sb, KERN_ERR, "error loading journal"); goto err_out; } EXT4_SB(sb)->s_journal = journal; err = ext4_clear_journal_err(sb, es); if (err) { EXT4_SB(sb)->s_journal = NULL; jbd2_journal_destroy(journal); return err; } if (!really_read_only && journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); ext4_commit_super(sb); } if (!really_read_only && journal_inum && journal_inum != le32_to_cpu(es->s_journal_inum)) { es->s_journal_inum = cpu_to_le32(journal_inum); ext4_commit_super(sb); } return 0; err_out: jbd2_journal_destroy(journal); return err; } /* Copy state of EXT4_SB(sb) into buffer for on-disk superblock */ static void ext4_update_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; struct buffer_head *sbh = sbi->s_sbh; lock_buffer(sbh); /* * If the file system is mounted read-only, don't update the * superblock write time. This avoids updating the superblock * write time when we are mounting the root file system * read/only but we need to replay the journal; at that point, * for people who are east of GMT and who make their clock * tick in localtime for Windows bug-for-bug compatibility, * the clock is set in the future, and this will cause e2fsck * to complain and force a full file system check. */ if (!sb_rdonly(sb)) ext4_update_tstamp(es, s_wtime); es->s_kbytes_written = cpu_to_le64(sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1)); if (percpu_counter_initialized(&sbi->s_freeclusters_counter)) ext4_free_blocks_count_set(es, EXT4_C2B(sbi, percpu_counter_sum_positive( &sbi->s_freeclusters_counter))); if (percpu_counter_initialized(&sbi->s_freeinodes_counter)) es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( &sbi->s_freeinodes_counter)); /* Copy error information to the on-disk superblock */ spin_lock(&sbi->s_error_lock); if (sbi->s_add_error_count > 0) { es->s_state |= cpu_to_le16(EXT4_ERROR_FS); if (!es->s_first_error_time && !es->s_first_error_time_hi) { __ext4_update_tstamp(&es->s_first_error_time, &es->s_first_error_time_hi, sbi->s_first_error_time); strtomem_pad(es->s_first_error_func, sbi->s_first_error_func, 0); es->s_first_error_line = cpu_to_le32(sbi->s_first_error_line); es->s_first_error_ino = cpu_to_le32(sbi->s_first_error_ino); es->s_first_error_block = cpu_to_le64(sbi->s_first_error_block); es->s_first_error_errcode = ext4_errno_to_code(sbi->s_first_error_code); } __ext4_update_tstamp(&es->s_last_error_time, &es->s_last_error_time_hi, sbi->s_last_error_time); strtomem_pad(es->s_last_error_func, sbi->s_last_error_func, 0); es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block); es->s_last_error_errcode = ext4_errno_to_code(sbi->s_last_error_code); /* * Start the daily error reporting function if it hasn't been * started already */ if (!es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); le32_add_cpu(&es->s_error_count, sbi->s_add_error_count); sbi->s_add_error_count = 0; } spin_unlock(&sbi->s_error_lock); ext4_superblock_csum_set(sb); unlock_buffer(sbh); } static int ext4_commit_super(struct super_block *sb) { struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; if (!sbh) return -EINVAL; ext4_update_super(sb); lock_buffer(sbh); /* Buffer got discarded which means block device got invalidated */ if (!buffer_mapped(sbh)) { unlock_buffer(sbh); return -EIO; } if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { /* * Oh, dear. A previous attempt to write the * superblock failed. This could happen because the * USB device was yanked out. Or it could happen to * be a transient write error and maybe the block will * be remapped. Nothing we can do but to retry the * write and hope for the best. */ ext4_msg(sb, KERN_ERR, "previous I/O error to " "superblock detected"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } get_bh(sbh); /* Clear potential dirty bit if it was journalled update */ clear_buffer_dirty(sbh); sbh->b_end_io = end_buffer_write_sync; submit_bh(REQ_OP_WRITE | REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh); wait_on_buffer(sbh); if (buffer_write_io_error(sbh)) { ext4_msg(sb, KERN_ERR, "I/O error while writing " "superblock"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); return -EIO; } return 0; } /* * Have we just finished recovery? If so, and if we are mounting (or * remounting) the filesystem readonly, then we will end up with a * consistent fs on disk. Record that fact. */ static int ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es) { int err; journal_t *journal = EXT4_SB(sb)->s_journal; if (!ext4_has_feature_journal(sb)) { if (journal != NULL) { ext4_error(sb, "Journal got removed while the fs was " "mounted!"); return -EFSCORRUPTED; } return 0; } jbd2_journal_lock_updates(journal); err = jbd2_journal_flush(journal, 0); if (err < 0) goto out; if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) || ext4_has_feature_orphan_present(sb))) { if (!ext4_orphan_file_empty(sb)) { ext4_error(sb, "Orphan file not empty on read-only fs."); err = -EFSCORRUPTED; goto out; } ext4_clear_feature_journal_needs_recovery(sb); ext4_clear_feature_orphan_present(sb); ext4_commit_super(sb); } out: jbd2_journal_unlock_updates(journal); return err; } /* * If we are mounting (or read-write remounting) a filesystem whose journal * has recorded an error from a previous lifetime, move that error to the * main filesystem now. */ static int ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es) { journal_t *journal; int j_errno; const char *errstr; if (!ext4_has_feature_journal(sb)) { ext4_error(sb, "Journal got removed while the fs was mounted!"); return -EFSCORRUPTED; } journal = EXT4_SB(sb)->s_journal; /* * Now check for any error status which may have been recorded in the * journal by a prior ext4_error() or ext4_abort() */ j_errno = jbd2_journal_errno(journal); if (j_errno) { char nbuf[16]; errstr = ext4_decode_error(sb, j_errno, nbuf); ext4_warning(sb, "Filesystem error recorded " "from previous mount: %s", errstr); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); j_errno = ext4_commit_super(sb); if (j_errno) return j_errno; ext4_warning(sb, "Marked fs in need of filesystem check."); jbd2_journal_clear_err(journal); jbd2_journal_update_sb_errno(journal); } return 0; } /* * Force the running and committing transactions to commit, * and wait on the commit. */ int ext4_force_commit(struct super_block *sb) { return ext4_journal_force_commit(EXT4_SB(sb)->s_journal); } static int ext4_sync_fs(struct super_block *sb, int wait) { int ret = 0; tid_t target; bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); if (unlikely(ext4_forced_shutdown(sb))) return -EIO; trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->rsv_conversion_wq); /* * Writeback quota in non-journalled quota case - journalled quota has * no dirty dquots */ dquot_writeback_dquots(sb, -1); /* * Data writeback is possible w/o journal transaction, so barrier must * being sent at the end of the function. But we can skip it if * transaction_commit will do it for us. */ if (sbi->s_journal) { target = jbd2_get_latest_transaction(sbi->s_journal); if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) needs_barrier = true; if (jbd2_journal_start_commit(sbi->s_journal, &target)) { if (wait) ret = jbd2_log_wait_commit(sbi->s_journal, target); } } else if (wait && test_opt(sb, BARRIER)) needs_barrier = true; if (needs_barrier) { int err; err = blkdev_issue_flush(sb->s_bdev); if (!ret) ret = err; } return ret; } /* * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. * * Note that only this function cannot bring a filesystem to be in a clean * state independently. It relies on upper layer to stop all data & metadata * modifications. */ static int ext4_freeze(struct super_block *sb) { int error = 0; journal_t *journal = EXT4_SB(sb)->s_journal; if (journal) { /* Now we set up the journal barrier. */ jbd2_journal_lock_updates(journal); /* * Don't clear the needs_recovery flag if we failed to * flush the journal. */ error = jbd2_journal_flush(journal, 0); if (error < 0) goto out; /* Journal blocked and flushed, clear needs_recovery flag. */ ext4_clear_feature_journal_needs_recovery(sb); if (ext4_orphan_file_empty(sb)) ext4_clear_feature_orphan_present(sb); } error = ext4_commit_super(sb); out: if (journal) /* we rely on upper layer to stop further updates */ jbd2_journal_unlock_updates(journal); return error; } /* * Called by LVM after the snapshot is done. We need to reset the RECOVER * flag here, even though the filesystem is not technically dirty yet. */ static int ext4_unfreeze(struct super_block *sb) { if (ext4_forced_shutdown(sb)) return 0; if (EXT4_SB(sb)->s_journal) { /* Reset the needs_recovery flag before the fs is unlocked. */ ext4_set_feature_journal_needs_recovery(sb); if (ext4_has_feature_orphan_file(sb)) ext4_set_feature_orphan_present(sb); } ext4_commit_super(sb); return 0; } /* * Structure to save mount options for ext4_remount's benefit */ struct ext4_mount_options { unsigned long s_mount_opt; unsigned long s_mount_opt2; kuid_t s_resuid; kgid_t s_resgid; unsigned long s_commit_interval; u32 s_min_batch_time, s_max_batch_time; #ifdef CONFIG_QUOTA int s_jquota_fmt; char *s_qf_names[EXT4_MAXQUOTAS]; #endif }; static int __ext4_remount(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_super_block *es; struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long old_sb_flags; struct ext4_mount_options old_opts; ext4_group_t g; int err = 0; int alloc_ctx; #ifdef CONFIG_QUOTA int enable_quota = 0; int i, j; char *to_free[EXT4_MAXQUOTAS]; #endif /* Store the original options */ old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; old_opts.s_mount_opt2 = sbi->s_mount_opt2; old_opts.s_resuid = sbi->s_resuid; old_opts.s_resgid = sbi->s_resgid; old_opts.s_commit_interval = sbi->s_commit_interval; old_opts.s_min_batch_time = sbi->s_min_batch_time; old_opts.s_max_batch_time = sbi->s_max_batch_time; #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) if (sbi->s_qf_names[i]) { char *qf_name = get_qf_name(sb, sbi, i); old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL); if (!old_opts.s_qf_names[i]) { for (j = 0; j < i; j++) kfree(old_opts.s_qf_names[j]); return -ENOMEM; } } else old_opts.s_qf_names[i] = NULL; #endif if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) { if (sbi->s_journal && sbi->s_journal->j_task->io_context) ctx->journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; else ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; } if ((ctx->spec & EXT4_SPEC_s_stripe) && ext4_is_stripe_incompatible(sb, ctx->s_stripe)) { ext4_msg(sb, KERN_WARNING, "stripe (%lu) is not aligned with cluster size (%u), " "stripe is disabled", ctx->s_stripe, sbi->s_cluster_ratio); ctx->s_stripe = 0; } /* * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause * two calls to ext4_should_dioread_nolock() to return inconsistent * values, triggering WARN_ON in ext4_add_complete_io(). we grab * here s_writepages_rwsem to avoid race between writepages ops and * remount. */ alloc_ctx = ext4_writepages_down_write(sb); ext4_apply_options(fc, sb); ext4_writepages_up_write(sb, alloc_ctx); if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "changing journal_checksum " "during remount not supported; ignoring"); sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM; } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); err = -EINVAL; goto restore_opts; } if (test_opt(sb, DIOREAD_NOLOCK)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dioread_nolock"); err = -EINVAL; goto restore_opts; } } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) { if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ext4_msg(sb, KERN_ERR, "can't mount with " "journal_async_commit in data=ordered mode"); err = -EINVAL; goto restore_opts; } } if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) { ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount"); err = -EINVAL; goto restore_opts; } if ((old_opts.s_mount_opt & EXT4_MOUNT_DELALLOC) && !test_opt(sb, DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't disable delalloc during remount"); err = -EINVAL; goto restore_opts; } sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); es = sbi->s_es; if (sbi->s_journal) { ext4_init_journal_params(sb, sbi->s_journal); set_task_ioprio(sbi->s_journal->j_task, ctx->journal_ioprio); } /* Flush outstanding errors before changing fs state */ flush_work(&sbi->s_sb_upd_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { if (ext4_forced_shutdown(sb)) { err = -EROFS; goto restore_opts; } if (fc->sb_flags & SB_RDONLY) { err = sync_filesystem(sb); if (err < 0) goto restore_opts; err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; /* * First of all, the unconditional stuff we have to do * to disable replay of the journal when we next remount */ sb->s_flags |= SB_RDONLY; /* * OK, test if we are remounting a valid rw partition * readonly, and if so set the rdonly flag and then * mark the partition as valid again. */ if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) && (sbi->s_mount_state & EXT4_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); if (sbi->s_journal) { /* * We let remount-ro finish even if marking fs * as clean failed... */ ext4_mark_recovery_complete(sb, es); } } else { /* Make sure we can mount this feature set readwrite */ if (ext4_has_feature_readonly(sb) || !ext4_feature_set_ok(sb, 0)) { err = -EROFS; goto restore_opts; } /* * Make sure the group descriptor checksums * are sane. If they aren't, refuse to remount r/w. */ for (g = 0; g < sbi->s_groups_count; g++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, g, NULL); if (!ext4_group_desc_csum_verify(sb, g, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_remount: Checksum for group %u failed (%u!=%u)", g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)), le16_to_cpu(gdp->bg_checksum)); err = -EFSBADCRC; goto restore_opts; } } /* * If we have an unprocessed orphan list hanging * around from a previously readonly bdev mount, * require a full umount/remount for now. */ if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) { ext4_msg(sb, KERN_WARNING, "Couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " "umount/remount instead"); err = -EINVAL; goto restore_opts; } /* * Mounting a RDONLY partition read-write, so reread * and store the current valid flag. (It may have * been changed by e2fsck since we originally mounted * the partition.) */ if (sbi->s_journal) { err = ext4_clear_journal_err(sb, es); if (err) goto restore_opts; } sbi->s_mount_state = (le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY); err = ext4_setup_super(sb, es, 0); if (err) goto restore_opts; sb->s_flags &= ~SB_RDONLY; if (ext4_has_feature_mmp(sb)) { err = ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)); if (err) goto restore_opts; } #ifdef CONFIG_QUOTA enable_quota = 1; #endif } } /* * Handle creation of system zone data early because it can fail. * Releasing of existing data is done when we are sure remount will * succeed. */ if (test_opt(sb, BLOCK_VALIDITY) && !sbi->s_system_blks) { err = ext4_setup_system_zone(sb); if (err) goto restore_opts; } if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { err = ext4_commit_super(sb); if (err) goto restore_opts; } #ifdef CONFIG_QUOTA if (enable_quota) { if (sb_any_quota_suspended(sb)) dquot_resume(sb, -1); else if (ext4_has_feature_quota(sb)) { err = ext4_enable_quotas(sb); if (err) goto restore_opts; } } /* Release old quota file names */ for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(old_opts.s_qf_names[i]); #endif if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); /* * Reinitialize lazy itable initialization thread based on * current settings */ if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE)) ext4_unregister_li_request(sb); else { ext4_group_t first_not_zeroed; first_not_zeroed = ext4_has_uninit_itable(sb); ext4_register_li_request(sb, first_not_zeroed); } if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); /* * Handle aborting the filesystem as the last thing during remount to * avoid obsure errors during remount when some option changes fail to * apply due to shutdown filesystem. */ if (test_opt2(sb, ABORT)) ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); return 0; restore_opts: /* * If there was a failing r/w to ro transition, we may need to * re-enable quota */ if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) && sb_any_quota_suspended(sb)) dquot_resume(sb, -1); alloc_ctx = ext4_writepages_down_write(sb); sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_mount_opt2 = old_opts.s_mount_opt2; sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_min_batch_time = old_opts.s_min_batch_time; sbi->s_max_batch_time = old_opts.s_max_batch_time; ext4_writepages_up_write(sb, alloc_ctx); if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) { to_free[i] = get_qf_name(sb, sbi, i); rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]); } synchronize_rcu(); for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(to_free[i]); #endif if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); return err; } static int ext4_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int ret; fc->s_fs_info = EXT4_SB(sb); ret = ext4_check_opt_consistency(fc, sb); if (ret < 0) return ret; ret = __ext4_remount(fc, sb); if (ret < 0) return ret; ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.", &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", ext4_quota_mode(sb)); return 0; } #ifdef CONFIG_QUOTA static int ext4_statfs_project(struct super_block *sb, kprojid_t projid, struct kstatfs *buf) { struct kqid qid; struct dquot *dquot; u64 limit; u64 curblock; qid = make_kqid_projid(projid); dquot = dqget(sb, qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, dquot->dq_dqb.dqb_bhardlimit); limit >>= sb->s_blocksize_bits; if (limit && buf->f_blocks > limit) { curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; buf->f_blocks = limit; buf->f_bfree = buf->f_bavail = (buf->f_blocks > curblock) ? (buf->f_blocks - curblock) : 0; } limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, dquot->dq_dqb.dqb_ihardlimit); if (limit && buf->f_files > limit) { buf->f_files = limit; buf->f_ffree = (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; } spin_unlock(&dquot->dq_dqb_lock); dqput(dquot); return 0; } #endif static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_fsblk_t overhead = 0, resv_blocks; s64 bfree; resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); if (!test_opt(sb, MINIX_DF)) overhead = sbi->s_overhead; buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead); bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) - percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter); /* prevent underflow in case that few free space is available */ buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0)); buf->f_bavail = buf->f_bfree - (ext4_r_blocks_count(es) + resv_blocks); if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks)) buf->f_bavail = 0; buf->f_files = le32_to_cpu(es->s_inodes_count); buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); buf->f_namelen = EXT4_NAME_LEN; buf->f_fsid = uuid_to_fsid(es->s_uuid); #ifdef CONFIG_QUOTA if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && sb_has_quota_limits_enabled(sb, PRJQUOTA)) ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf); #endif return 0; } #ifdef CONFIG_QUOTA /* * Helper functions so that transaction is started before we acquire dqio_sem * to keep correct lock ordering of transaction > dqio_sem */ static inline struct inode *dquot_to_inode(struct dquot *dquot) { return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; } static int ext4_write_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; struct inode *inode; inode = dquot_to_inode(dquot); handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit(dquot); if (ret < 0) ext4_error_err(dquot->dq_sb, -ret, "Failed to commit dquot type %d", dquot->dq_id.type); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static int ext4_acquire_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_acquire(dquot); if (ret < 0) ext4_error_err(dquot->dq_sb, -ret, "Failed to acquire dquot type %d", dquot->dq_id.type); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static int ext4_release_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); return PTR_ERR(handle); } ret = dquot_release(dquot); if (ret < 0) ext4_error_err(dquot->dq_sb, -ret, "Failed to release dquot type %d", dquot->dq_id.type); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static int ext4_mark_dquot_dirty(struct dquot *dquot) { struct super_block *sb = dquot->dq_sb; if (ext4_is_quota_journalled(sb)) { dquot_mark_dquot_dirty(dquot); return ext4_write_dquot(dquot); } else { return dquot_mark_dquot_dirty(dquot); } } static int ext4_write_info(struct super_block *sb, int type) { int ret, err; handle_t *handle; /* Data block + inode block */ handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); err = ext4_journal_stop(handle); if (!ret) ret = err; return ret; } static void lockdep_set_quota_inode(struct inode *inode, int subclass) { struct ext4_inode_info *ei = EXT4_I(inode); /* The first argument of lockdep_set_subclass has to be * *exactly* the same as the argument to init_rwsem() --- in * this case, in init_once() --- or lockdep gets unhappy * because the name of the lock is set using the * stringification of the argument to init_rwsem(). */ (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ lockdep_set_subclass(&ei->i_data_sem, subclass); } /* * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path) { int err; if (!test_opt(sb, QUOTA)) return -EINVAL; /* Quotafile not on the same filesystem? */ if (path->dentry->d_sb != sb) return -EXDEV; /* Quota already enabled for this file? */ if (IS_NOQUOTA(d_inode(path->dentry))) return -EBUSY; /* Journaling quota? */ if (EXT4_SB(sb)->s_qf_names[type]) { /* Quotafile not in fs root? */ if (path->dentry->d_parent != sb->s_root) ext4_msg(sb, KERN_WARNING, "Quota file not on filesystem root. " "Journaled quota will not work"); sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY; } else { /* * Clear the flag just in case mount options changed since * last time. */ sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY; } lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); err = dquot_quota_on(sb, type, format_id, path); if (!err) { struct inode *inode = d_inode(path->dentry); handle_t *handle; /* * Set inode flags to prevent userspace from messing with quota * files. If this fails, we return success anyway since quotas * are already enabled and this is not a hard failure. */ inode_lock(inode); handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) goto unlock_inode; EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL; inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, S_NOATIME | S_IMMUTABLE); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); unlock_inode: inode_unlock(inode); if (err) dquot_quota_off(sb, type); } if (err) lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_NORMAL); return err; } static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum) { switch (type) { case USRQUOTA: return qf_inum == EXT4_USR_QUOTA_INO; case GRPQUOTA: return qf_inum == EXT4_GRP_QUOTA_INO; case PRJQUOTA: return qf_inum >= EXT4_GOOD_OLD_FIRST_INO; default: BUG(); } } static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags) { int err; struct inode *qf_inode; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; BUG_ON(!ext4_has_feature_quota(sb)); if (!qf_inums[type]) return -EPERM; if (!ext4_check_quota_inum(type, qf_inums[type])) { ext4_error(sb, "Bad quota inum: %lu, type: %d", qf_inums[type], type); return -EUCLEAN; } qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL); if (IS_ERR(qf_inode)) { ext4_error(sb, "Bad quota inode: %lu, type: %d", qf_inums[type], type); return PTR_ERR(qf_inode); } /* Don't account quota for quota files to avoid recursion */ qf_inode->i_flags |= S_NOQUOTA; lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA); err = dquot_load_quota_inode(qf_inode, type, format_id, flags); if (err) lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL); iput(qf_inode); return err; } /* Enable usage tracking for all quota types. */ int ext4_enable_quotas(struct super_block *sb) { int type, err = 0; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; bool quota_mopt[EXT4_MAXQUOTAS] = { test_opt(sb, USRQUOTA), test_opt(sb, GRPQUOTA), test_opt(sb, PRJQUOTA), }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; for (type = 0; type < EXT4_MAXQUOTAS; type++) { if (qf_inums[type]) { err = ext4_quota_enable(sb, type, QFMT_VFS_V1, DQUOT_USAGE_ENABLED | (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { ext4_warning(sb, "Failed to enable quota tracking " "(type=%d, err=%d, ino=%lu). " "Please run e2fsck to fix.", type, err, qf_inums[type]); ext4_quotas_off(sb, type); return err; } } } return 0; } static int ext4_quota_off(struct super_block *sb, int type) { struct inode *inode = sb_dqopt(sb)->files[type]; handle_t *handle; int err; /* Force all delayed allocation blocks to be allocated. * Caller already holds s_umount sem */ if (test_opt(sb, DELALLOC)) sync_filesystem(sb); if (!inode || !igrab(inode)) goto out; err = dquot_quota_off(sb, type); if (err || ext4_has_feature_quota(sb)) goto out_put; /* * When the filesystem was remounted read-only first, we cannot cleanup * inode flags here. Bad luck but people should be using QUOTA feature * these days anyway. */ if (sb_rdonly(sb)) goto out_put; inode_lock(inode); /* * Update modification times of quota files when userspace can * start looking at them. If we fail, we return success anyway since * this is not a hard failure and quotas are already disabled. */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out_unlock; } EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); out_unlock: inode_unlock(inode); out_put: lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL); iput(inode); return err; out: return dquot_quota_off(sb, type); } /* Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code * itself serializes the operations (and no one else should touch the files) * we don't have to be afraid of races */ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int offset = off & (sb->s_blocksize - 1); int tocopy; size_t toread; struct buffer_head *bh; loff_t i_size = i_size_read(inode); if (off > i_size) return 0; if (off+len > i_size) len = i_size-off; toread = len; while (toread > 0) { tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); bh = ext4_bread(NULL, inode, blk, 0); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) /* A hole? */ memset(data, 0, tocopy); else memcpy(data, bh->b_data+offset, tocopy); brelse(bh); offset = 0; toread -= tocopy; data += tocopy; blk++; } return len; } /* Write to quotafile (we know the transaction is already started and has * enough credits) */ static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off) { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1); int retries = 0; struct buffer_head *bh; handle_t *handle = journal_current_handle(); if (!handle) { ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" " cancelled because transaction is not started", (unsigned long long)off, (unsigned long long)len); return -EIO; } /* * Since we account only one data block in transaction credits, * then it is impossible to cross a block boundary. */ if (sb->s_blocksize - offset < len) { ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" " cancelled because not block aligned", (unsigned long long)off, (unsigned long long)len); return -EIO; } do { bh = ext4_bread(handle, inode, blk, EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL); } while (PTR_ERR(bh) == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) goto out; BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) { brelse(bh); return err; } lock_buffer(bh); memcpy(bh->b_data+offset, data, len); flush_dcache_page(bh->b_page); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); out: if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; } return err ? err : len; } #endif #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static inline void register_as_ext2(void) { int err = register_filesystem(&ext2_fs_type); if (err) printk(KERN_WARNING "EXT4-fs: Unable to register as ext2 (%d)\n", err); } static inline void unregister_as_ext2(void) { unregister_filesystem(&ext2_fs_type); } static inline int ext2_feature_set_ok(struct super_block *sb) { if (ext4_has_unknown_ext2_incompat_features(sb)) return 0; if (sb_rdonly(sb)) return 1; if (ext4_has_unknown_ext2_ro_compat_features(sb)) return 0; return 1; } #else static inline void register_as_ext2(void) { } static inline void unregister_as_ext2(void) { } static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; } #endif static inline void register_as_ext3(void) { int err = register_filesystem(&ext3_fs_type); if (err) printk(KERN_WARNING "EXT4-fs: Unable to register as ext3 (%d)\n", err); } static inline void unregister_as_ext3(void) { unregister_filesystem(&ext3_fs_type); } static inline int ext3_feature_set_ok(struct super_block *sb) { if (ext4_has_unknown_ext3_incompat_features(sb)) return 0; if (!ext4_has_feature_journal(sb)) return 0; if (sb_rdonly(sb)) return 1; if (ext4_has_unknown_ext3_ro_compat_features(sb)) return 0; return 1; } static void ext4_kill_sb(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct file *bdev_file = sbi ? sbi->s_journal_bdev_file : NULL; kill_block_super(sb); if (bdev_file) bdev_fput(bdev_file); } static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("ext4"); /* Shared across all ext4 file systems */ wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; static int __init ext4_init_fs(void) { int i, err; ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); ext4_li_info = NULL; /* Build-time check for flags consistency */ ext4_check_flag_values(); for (i = 0; i < EXT4_WQ_HASH_SZ; i++) init_waitqueue_head(&ext4__ioend_wq[i]); err = ext4_init_es(); if (err) return err; err = ext4_init_pending(); if (err) goto out7; err = ext4_init_post_read_processing(); if (err) goto out6; err = ext4_init_pageio(); if (err) goto out5; err = ext4_init_system_zone(); if (err) goto out4; err = ext4_init_sysfs(); if (err) goto out3; err = ext4_init_mballoc(); if (err) goto out2; err = init_inodecache(); if (err) goto out1; err = ext4_fc_init_dentry_cache(); if (err) goto out05; register_as_ext3(); register_as_ext2(); err = register_filesystem(&ext4_fs_type); if (err) goto out; return 0; out: unregister_as_ext2(); unregister_as_ext3(); ext4_fc_destroy_dentry_cache(); out05: destroy_inodecache(); out1: ext4_exit_mballoc(); out2: ext4_exit_sysfs(); out3: ext4_exit_system_zone(); out4: ext4_exit_pageio(); out5: ext4_exit_post_read_processing(); out6: ext4_exit_pending(); out7: ext4_exit_es(); return err; } static void __exit ext4_exit_fs(void) { ext4_destroy_lazyinit_thread(); unregister_as_ext2(); unregister_as_ext3(); unregister_filesystem(&ext4_fs_type); ext4_fc_destroy_dentry_cache(); destroy_inodecache(); ext4_exit_mballoc(); ext4_exit_sysfs(); ext4_exit_system_zone(); ext4_exit_pageio(); ext4_exit_post_read_processing(); ext4_exit_es(); ext4_exit_pending(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); MODULE_SOFTDEP("pre: crc32c"); module_init(ext4_init_fs) module_exit(ext4_exit_fs) |
23 23 23 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 | /* * Copyright (c) Yann Collet, Facebook, Inc. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses. */ #include "zstd_ldm.h" #include "../common/debug.h" #include <linux/xxhash.h> #include "zstd_fast.h" /* ZSTD_fillHashTable() */ #include "zstd_double_fast.h" /* ZSTD_fillDoubleHashTable() */ #include "zstd_ldm_geartab.h" #define LDM_BUCKET_SIZE_LOG 3 #define LDM_MIN_MATCH_LENGTH 64 #define LDM_HASH_RLOG 7 typedef struct { U64 rolling; U64 stopMask; } ldmRollingHashState_t; /* ZSTD_ldm_gear_init(): * * Initializes the rolling hash state such that it will honor the * settings in params. */ static void ZSTD_ldm_gear_init(ldmRollingHashState_t* state, ldmParams_t const* params) { unsigned maxBitsInMask = MIN(params->minMatchLength, 64); unsigned hashRateLog = params->hashRateLog; state->rolling = ~(U32)0; /* The choice of the splitting criterion is subject to two conditions: * 1. it has to trigger on average every 2^(hashRateLog) bytes; * 2. ideally, it has to depend on a window of minMatchLength bytes. * * In the gear hash algorithm, bit n depends on the last n bytes; * so in order to obtain a good quality splitting criterion it is * preferable to use bits with high weight. * * To match condition 1 we use a mask with hashRateLog bits set * and, because of the previous remark, we make sure these bits * have the highest possible weight while still respecting * condition 2. */ if (hashRateLog > 0 && hashRateLog <= maxBitsInMask) { state->stopMask = (((U64)1 << hashRateLog) - 1) << (maxBitsInMask - hashRateLog); } else { /* In this degenerate case we simply honor the hash rate. */ state->stopMask = ((U64)1 << hashRateLog) - 1; } } /* ZSTD_ldm_gear_reset() * Feeds [data, data + minMatchLength) into the hash without registering any * splits. This effectively resets the hash state. This is used when skipping * over data, either at the beginning of a block, or skipping sections. */ static void ZSTD_ldm_gear_reset(ldmRollingHashState_t* state, BYTE const* data, size_t minMatchLength) { U64 hash = state->rolling; size_t n = 0; #define GEAR_ITER_ONCE() do { \ hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \ n += 1; \ } while (0) while (n + 3 < minMatchLength) { GEAR_ITER_ONCE(); GEAR_ITER_ONCE(); GEAR_ITER_ONCE(); GEAR_ITER_ONCE(); } while (n < minMatchLength) { GEAR_ITER_ONCE(); } #undef GEAR_ITER_ONCE } /* ZSTD_ldm_gear_feed(): * * Registers in the splits array all the split points found in the first * size bytes following the data pointer. This function terminates when * either all the data has been processed or LDM_BATCH_SIZE splits are * present in the splits array. * * Precondition: The splits array must not be full. * Returns: The number of bytes processed. */ static size_t ZSTD_ldm_gear_feed(ldmRollingHashState_t* state, BYTE const* data, size_t size, size_t* splits, unsigned* numSplits) { size_t n; U64 hash, mask; hash = state->rolling; mask = state->stopMask; n = 0; #define GEAR_ITER_ONCE() do { \ hash = (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \ n += 1; \ if (UNLIKELY((hash & mask) == 0)) { \ splits[*numSplits] = n; \ *numSplits += 1; \ if (*numSplits == LDM_BATCH_SIZE) \ goto done; \ } \ } while (0) while (n + 3 < size) { GEAR_ITER_ONCE(); GEAR_ITER_ONCE(); GEAR_ITER_ONCE(); GEAR_ITER_ONCE(); } while (n < size) { GEAR_ITER_ONCE(); } #undef GEAR_ITER_ONCE done: state->rolling = hash; return n; } void ZSTD_ldm_adjustParameters(ldmParams_t* params, ZSTD_compressionParameters const* cParams) { params->windowLog = cParams->windowLog; ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX); DEBUGLOG(4, "ZSTD_ldm_adjustParameters"); if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG; if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH; if (params->hashLog == 0) { params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG); assert(params->hashLog <= ZSTD_HASHLOG_MAX); } if (params->hashRateLog == 0) { params->hashRateLog = params->windowLog < params->hashLog ? 0 : params->windowLog - params->hashLog; } params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog); } size_t ZSTD_ldm_getTableSize(ldmParams_t params) { size_t const ldmHSize = ((size_t)1) << params.hashLog; size_t const ldmBucketSizeLog = MIN(params.bucketSizeLog, params.hashLog); size_t const ldmBucketSize = ((size_t)1) << (params.hashLog - ldmBucketSizeLog); size_t const totalSize = ZSTD_cwksp_alloc_size(ldmBucketSize) + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEntry_t)); return params.enableLdm == ZSTD_ps_enable ? totalSize : 0; } size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize) { return params.enableLdm == ZSTD_ps_enable ? (maxChunkSize / params.minMatchLength) : 0; } /* ZSTD_ldm_getBucket() : * Returns a pointer to the start of the bucket associated with hash. */ static ldmEntry_t* ZSTD_ldm_getBucket( ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams) { return ldmState->hashTable + (hash << ldmParams.bucketSizeLog); } /* ZSTD_ldm_insertEntry() : * Insert the entry with corresponding hash into the hash table */ static void ZSTD_ldm_insertEntry(ldmState_t* ldmState, size_t const hash, const ldmEntry_t entry, ldmParams_t const ldmParams) { BYTE* const pOffset = ldmState->bucketOffsets + hash; unsigned const offset = *pOffset; *(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + offset) = entry; *pOffset = (BYTE)((offset + 1) & ((1u << ldmParams.bucketSizeLog) - 1)); } /* ZSTD_ldm_countBackwardsMatch() : * Returns the number of bytes that match backwards before pIn and pMatch. * * We count only bytes where pMatch >= pBase and pIn >= pAnchor. */ static size_t ZSTD_ldm_countBackwardsMatch( const BYTE* pIn, const BYTE* pAnchor, const BYTE* pMatch, const BYTE* pMatchBase) { size_t matchLength = 0; while (pIn > pAnchor && pMatch > pMatchBase && pIn[-1] == pMatch[-1]) { pIn--; pMatch--; matchLength++; } return matchLength; } /* ZSTD_ldm_countBackwardsMatch_2segments() : * Returns the number of bytes that match backwards from pMatch, * even with the backwards match spanning 2 different segments. * * On reaching `pMatchBase`, start counting from mEnd */ static size_t ZSTD_ldm_countBackwardsMatch_2segments( const BYTE* pIn, const BYTE* pAnchor, const BYTE* pMatch, const BYTE* pMatchBase, const BYTE* pExtDictStart, const BYTE* pExtDictEnd) { size_t matchLength = ZSTD_ldm_countBackwardsMatch(pIn, pAnchor, pMatch, pMatchBase); if (pMatch - matchLength != pMatchBase || pMatchBase == pExtDictStart) { /* If backwards match is entirely in the extDict or prefix, immediately return */ return matchLength; } DEBUGLOG(7, "ZSTD_ldm_countBackwardsMatch_2segments: found 2-parts backwards match (length in prefix==%zu)", matchLength); matchLength += ZSTD_ldm_countBackwardsMatch(pIn - matchLength, pAnchor, pExtDictEnd, pExtDictStart); DEBUGLOG(7, "final backwards match length = %zu", matchLength); return matchLength; } /* ZSTD_ldm_fillFastTables() : * * Fills the relevant tables for the ZSTD_fast and ZSTD_dfast strategies. * This is similar to ZSTD_loadDictionaryContent. * * The tables for the other strategies are filled within their * block compressors. */ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, void const* end) { const BYTE* const iend = (const BYTE*)end; switch(ms->cParams.strategy) { case ZSTD_fast: ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); break; case ZSTD_dfast: ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); break; case ZSTD_greedy: case ZSTD_lazy: case ZSTD_lazy2: case ZSTD_btlazy2: case ZSTD_btopt: case ZSTD_btultra: case ZSTD_btultra2: break; default: assert(0); /* not possible : not a valid strategy id */ } return 0; } void ZSTD_ldm_fillHashTable( ldmState_t* ldmState, const BYTE* ip, const BYTE* iend, ldmParams_t const* params) { U32 const minMatchLength = params->minMatchLength; U32 const hBits = params->hashLog - params->bucketSizeLog; BYTE const* const base = ldmState->window.base; BYTE const* const istart = ip; ldmRollingHashState_t hashState; size_t* const splits = ldmState->splitIndices; unsigned numSplits; DEBUGLOG(5, "ZSTD_ldm_fillHashTable"); ZSTD_ldm_gear_init(&hashState, params); while (ip < iend) { size_t hashed; unsigned n; numSplits = 0; hashed = ZSTD_ldm_gear_feed(&hashState, ip, iend - ip, splits, &numSplits); for (n = 0; n < numSplits; n++) { if (ip + splits[n] >= istart + minMatchLength) { BYTE const* const split = ip + splits[n] - minMatchLength; U64 const xxhash = xxh64(split, minMatchLength, 0); U32 const hash = (U32)(xxhash & (((U32)1 << hBits) - 1)); ldmEntry_t entry; entry.offset = (U32)(split - base); entry.checksum = (U32)(xxhash >> 32); ZSTD_ldm_insertEntry(ldmState, hash, entry, *params); } } ip += hashed; } } /* ZSTD_ldm_limitTableUpdate() : * * Sets cctx->nextToUpdate to a position corresponding closer to anchor * if it is far way * (after a long match, only update tables a limited amount). */ static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) { U32 const curr = (U32)(anchor - ms->window.base); if (curr > ms->nextToUpdate + 1024) { ms->nextToUpdate = curr - MIN(512, curr - ms->nextToUpdate - 1024); } } static size_t ZSTD_ldm_generateSequences_internal( ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, ldmParams_t const* params, void const* src, size_t srcSize) { /* LDM parameters */ int const extDict = ZSTD_window_hasExtDict(ldmState->window); U32 const minMatchLength = params->minMatchLength; U32 const entsPerBucket = 1U << params->bucketSizeLog; U32 const hBits = params->hashLog - params->bucketSizeLog; /* Prefix and extDict parameters */ U32 const dictLimit = ldmState->window.dictLimit; U32 const lowestIndex = extDict ? ldmState->window.lowLimit : dictLimit; BYTE const* const base = ldmState->window.base; BYTE const* const dictBase = extDict ? ldmState->window.dictBase : NULL; BYTE const* const dictStart = extDict ? dictBase + lowestIndex : NULL; BYTE const* const dictEnd = extDict ? dictBase + dictLimit : NULL; BYTE const* const lowPrefixPtr = base + dictLimit; /* Input bounds */ BYTE const* const istart = (BYTE const*)src; BYTE const* const iend = istart + srcSize; BYTE const* const ilimit = iend - HASH_READ_SIZE; /* Input positions */ BYTE const* anchor = istart; BYTE const* ip = istart; /* Rolling hash state */ ldmRollingHashState_t hashState; /* Arrays for staged-processing */ size_t* const splits = ldmState->splitIndices; ldmMatchCandidate_t* const candidates = ldmState->matchCandidates; unsigned numSplits; if (srcSize < minMatchLength) return iend - anchor; /* Initialize the rolling hash state with the first minMatchLength bytes */ ZSTD_ldm_gear_init(&hashState, params); ZSTD_ldm_gear_reset(&hashState, ip, minMatchLength); ip += minMatchLength; while (ip < ilimit) { size_t hashed; unsigned n; numSplits = 0; hashed = ZSTD_ldm_gear_feed(&hashState, ip, ilimit - ip, splits, &numSplits); for (n = 0; n < numSplits; n++) { BYTE const* const split = ip + splits[n] - minMatchLength; U64 const xxhash = xxh64(split, minMatchLength, 0); U32 const hash = (U32)(xxhash & (((U32)1 << hBits) - 1)); candidates[n].split = split; candidates[n].hash = hash; candidates[n].checksum = (U32)(xxhash >> 32); candidates[n].bucket = ZSTD_ldm_getBucket(ldmState, hash, *params); PREFETCH_L1(candidates[n].bucket); } for (n = 0; n < numSplits; n++) { size_t forwardMatchLength = 0, backwardMatchLength = 0, bestMatchLength = 0, mLength; U32 offset; BYTE const* const split = candidates[n].split; U32 const checksum = candidates[n].checksum; U32 const hash = candidates[n].hash; ldmEntry_t* const bucket = candidates[n].bucket; ldmEntry_t const* cur; ldmEntry_t const* bestEntry = NULL; ldmEntry_t newEntry; newEntry.offset = (U32)(split - base); newEntry.checksum = checksum; /* If a split point would generate a sequence overlapping with * the previous one, we merely register it in the hash table and * move on */ if (split < anchor) { ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); continue; } for (cur = bucket; cur < bucket + entsPerBucket; cur++) { size_t curForwardMatchLength, curBackwardMatchLength, curTotalMatchLength; if (cur->checksum != checksum || cur->offset <= lowestIndex) { continue; } if (extDict) { BYTE const* const curMatchBase = cur->offset < dictLimit ? dictBase : base; BYTE const* const pMatch = curMatchBase + cur->offset; BYTE const* const matchEnd = cur->offset < dictLimit ? dictEnd : iend; BYTE const* const lowMatchPtr = cur->offset < dictLimit ? dictStart : lowPrefixPtr; curForwardMatchLength = ZSTD_count_2segments(split, pMatch, iend, matchEnd, lowPrefixPtr); if (curForwardMatchLength < minMatchLength) { continue; } curBackwardMatchLength = ZSTD_ldm_countBackwardsMatch_2segments( split, anchor, pMatch, lowMatchPtr, dictStart, dictEnd); } else { /* !extDict */ BYTE const* const pMatch = base + cur->offset; curForwardMatchLength = ZSTD_count(split, pMatch, iend); if (curForwardMatchLength < minMatchLength) { continue; } curBackwardMatchLength = ZSTD_ldm_countBackwardsMatch(split, anchor, pMatch, lowPrefixPtr); } curTotalMatchLength = curForwardMatchLength + curBackwardMatchLength; if (curTotalMatchLength > bestMatchLength) { bestMatchLength = curTotalMatchLength; forwardMatchLength = curForwardMatchLength; backwardMatchLength = curBackwardMatchLength; bestEntry = cur; } } /* No match found -- insert an entry into the hash table * and process the next candidate match */ if (bestEntry == NULL) { ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); continue; } /* Match found */ offset = (U32)(split - base) - bestEntry->offset; mLength = forwardMatchLength + backwardMatchLength; { rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size; /* Out of sequence storage */ if (rawSeqStore->size == rawSeqStore->capacity) return ERROR(dstSize_tooSmall); seq->litLength = (U32)(split - backwardMatchLength - anchor); seq->matchLength = (U32)mLength; seq->offset = offset; rawSeqStore->size++; } /* Insert the current entry into the hash table --- it must be * done after the previous block to avoid clobbering bestEntry */ ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); anchor = split + forwardMatchLength; /* If we find a match that ends after the data that we've hashed * then we have a repeating, overlapping, pattern. E.g. all zeros. * If one repetition of the pattern matches our `stopMask` then all * repetitions will. We don't need to insert them all into out table, * only the first one. So skip over overlapping matches. * This is a major speed boost (20x) for compressing a single byte * repeated, when that byte ends up in the table. */ if (anchor > ip + hashed) { ZSTD_ldm_gear_reset(&hashState, anchor - minMatchLength, minMatchLength); /* Continue the outer loop at anchor (ip + hashed == anchor). */ ip = anchor - hashed; break; } } ip += hashed; } return iend - anchor; } /*! ZSTD_ldm_reduceTable() : * reduce table indexes by `reducerValue` */ static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size, U32 const reducerValue) { U32 u; for (u = 0; u < size; u++) { if (table[u].offset < reducerValue) table[u].offset = 0; else table[u].offset -= reducerValue; } } size_t ZSTD_ldm_generateSequences( ldmState_t* ldmState, rawSeqStore_t* sequences, ldmParams_t const* params, void const* src, size_t srcSize) { U32 const maxDist = 1U << params->windowLog; BYTE const* const istart = (BYTE const*)src; BYTE const* const iend = istart + srcSize; size_t const kMaxChunkSize = 1 << 20; size_t const nbChunks = (srcSize / kMaxChunkSize) + ((srcSize % kMaxChunkSize) != 0); size_t chunk; size_t leftoverSize = 0; assert(ZSTD_CHUNKSIZE_MAX >= kMaxChunkSize); /* Check that ZSTD_window_update() has been called for this chunk prior * to passing it to this function. */ assert(ldmState->window.nextSrc >= (BYTE const*)src + srcSize); /* The input could be very large (in zstdmt), so it must be broken up into * chunks to enforce the maximum distance and handle overflow correction. */ assert(sequences->pos <= sequences->size); assert(sequences->size <= sequences->capacity); for (chunk = 0; chunk < nbChunks && sequences->size < sequences->capacity; ++chunk) { BYTE const* const chunkStart = istart + chunk * kMaxChunkSize; size_t const remaining = (size_t)(iend - chunkStart); BYTE const *const chunkEnd = (remaining < kMaxChunkSize) ? iend : chunkStart + kMaxChunkSize; size_t const chunkSize = chunkEnd - chunkStart; size_t newLeftoverSize; size_t const prevSize = sequences->size; assert(chunkStart < iend); /* 1. Perform overflow correction if necessary. */ if (ZSTD_window_needOverflowCorrection(ldmState->window, 0, maxDist, ldmState->loadedDictEnd, chunkStart, chunkEnd)) { U32 const ldmHSize = 1U << params->hashLog; U32 const correction = ZSTD_window_correctOverflow( &ldmState->window, /* cycleLog */ 0, maxDist, chunkStart); ZSTD_ldm_reduceTable(ldmState->hashTable, ldmHSize, correction); /* invalidate dictionaries on overflow correction */ ldmState->loadedDictEnd = 0; } /* 2. We enforce the maximum offset allowed. * * kMaxChunkSize should be small enough that we don't lose too much of * the window through early invalidation. * TODO: * Test the chunk size. * * Try invalidation after the sequence generation and test the * the offset against maxDist directly. * * NOTE: Because of dictionaries + sequence splitting we MUST make sure * that any offset used is valid at the END of the sequence, since it may * be split into two sequences. This condition holds when using * ZSTD_window_enforceMaxDist(), but if we move to checking offsets * against maxDist directly, we'll have to carefully handle that case. */ ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, &ldmState->loadedDictEnd, NULL); /* 3. Generate the sequences for the chunk, and get newLeftoverSize. */ newLeftoverSize = ZSTD_ldm_generateSequences_internal( ldmState, sequences, params, chunkStart, chunkSize); if (ZSTD_isError(newLeftoverSize)) return newLeftoverSize; /* 4. We add the leftover literals from previous iterations to the first * newly generated sequence, or add the `newLeftoverSize` if none are * generated. */ /* Prepend the leftover literals from the last call */ if (prevSize < sequences->size) { sequences->seq[prevSize].litLength += (U32)leftoverSize; leftoverSize = newLeftoverSize; } else { assert(newLeftoverSize == chunkSize); leftoverSize += chunkSize; } } return 0; } void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) { while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) { rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos; if (srcSize <= seq->litLength) { /* Skip past srcSize literals */ seq->litLength -= (U32)srcSize; return; } srcSize -= seq->litLength; seq->litLength = 0; if (srcSize < seq->matchLength) { /* Skip past the first srcSize of the match */ seq->matchLength -= (U32)srcSize; if (seq->matchLength < minMatch) { /* The match is too short, omit it */ if (rawSeqStore->pos + 1 < rawSeqStore->size) { seq[1].litLength += seq[0].matchLength; } rawSeqStore->pos++; } return; } srcSize -= seq->matchLength; seq->matchLength = 0; rawSeqStore->pos++; } } /* * If the sequence length is longer than remaining then the sequence is split * between this block and the next. * * Returns the current sequence to handle, or if the rest of the block should * be literals, it returns a sequence with offset == 0. */ static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore, U32 const remaining, U32 const minMatch) { rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos]; assert(sequence.offset > 0); /* Likely: No partial sequence */ if (remaining >= sequence.litLength + sequence.matchLength) { rawSeqStore->pos++; return sequence; } /* Cut the sequence short (offset == 0 ==> rest is literals). */ if (remaining <= sequence.litLength) { sequence.offset = 0; } else if (remaining < sequence.litLength + sequence.matchLength) { sequence.matchLength = remaining - sequence.litLength; if (sequence.matchLength < minMatch) { sequence.offset = 0; } } /* Skip past `remaining` bytes for the future sequences. */ ZSTD_ldm_skipSequences(rawSeqStore, remaining, minMatch); return sequence; } void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) { U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); while (currPos && rawSeqStore->pos < rawSeqStore->size) { rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos]; if (currPos >= currSeq.litLength + currSeq.matchLength) { currPos -= currSeq.litLength + currSeq.matchLength; rawSeqStore->pos++; } else { rawSeqStore->posInSequence = currPos; break; } } if (currPos == 0 || rawSeqStore->pos == rawSeqStore->size) { rawSeqStore->posInSequence = 0; } } size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ZSTD_paramSwitch_e useRowMatchFinder, void const* src, size_t srcSize) { const ZSTD_compressionParameters* const cParams = &ms->cParams; unsigned const minMatch = cParams->minMatch; ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, ZSTD_matchState_dictMode(ms)); /* Input bounds */ BYTE const* const istart = (BYTE const*)src; BYTE const* const iend = istart + srcSize; /* Input positions */ BYTE const* ip = istart; DEBUGLOG(5, "ZSTD_ldm_blockCompress: srcSize=%zu", srcSize); /* If using opt parser, use LDMs only as candidates rather than always accepting them */ if (cParams->strategy >= ZSTD_btopt) { size_t lastLLSize; ms->ldmSeqStore = rawSeqStore; lastLLSize = blockCompressor(ms, seqStore, rep, src, srcSize); ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore, srcSize); return lastLLSize; } assert(rawSeqStore->pos <= rawSeqStore->size); assert(rawSeqStore->size <= rawSeqStore->capacity); /* Loop through each sequence and apply the block compressor to the literals */ while (rawSeqStore->pos < rawSeqStore->size && ip < iend) { /* maybeSplitSequence updates rawSeqStore->pos */ rawSeq const sequence = maybeSplitSequence(rawSeqStore, (U32)(iend - ip), minMatch); int i; /* End signal */ if (sequence.offset == 0) break; assert(ip + sequence.litLength + sequence.matchLength <= iend); /* Fill tables for block compressor */ ZSTD_ldm_limitTableUpdate(ms, ip); ZSTD_ldm_fillFastTables(ms, ip); /* Run the block compressor */ DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); { size_t const newLitLength = blockCompressor(ms, seqStore, rep, ip, sequence.litLength); ip += sequence.litLength; /* Update the repcodes */ for (i = ZSTD_REP_NUM - 1; i > 0; i--) rep[i] = rep[i-1]; rep[0] = sequence.offset; /* Store the sequence */ ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, STORE_OFFSET(sequence.offset), sequence.matchLength); ip += sequence.matchLength; } } /* Fill the tables for the block compressor */ ZSTD_ldm_limitTableUpdate(ms, ip); ZSTD_ldm_fillFastTables(ms, ip); /* Compress the last literals */ return blockCompressor(ms, seqStore, rep, ip, iend - ip); } |
2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2000-2002 Vojtech Pavlik <vojtech@ucw.cz> * Copyright (c) 2001-2002, 2007 Johann Deneux <johann.deneux@gmail.com> * * USB/RS232 I-Force joysticks and wheels. */ #include <linux/unaligned.h> #include "iforce.h" MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>, Johann Deneux <johann.deneux@gmail.com>"); MODULE_DESCRIPTION("Core I-Force joysticks and wheels driver"); MODULE_LICENSE("GPL"); static signed short btn_joystick[] = { BTN_TRIGGER, BTN_TOP, BTN_THUMB, BTN_TOP2, BTN_BASE, BTN_BASE2, BTN_BASE3, BTN_BASE4, BTN_BASE5, BTN_A, BTN_B, BTN_C, BTN_DEAD, -1 }; static signed short btn_joystick_avb[] = { BTN_TRIGGER, BTN_THUMB, BTN_TOP, BTN_TOP2, BTN_BASE, BTN_BASE2, BTN_BASE3, BTN_BASE4, BTN_DEAD, -1 }; static signed short btn_wheel[] = { BTN_GEAR_DOWN, BTN_GEAR_UP, BTN_BASE, BTN_BASE2, BTN_BASE3, BTN_BASE4, BTN_BASE5, BTN_BASE6, -1 }; static signed short abs_joystick[] = { ABS_X, ABS_Y, ABS_THROTTLE, ABS_HAT0X, ABS_HAT0Y, -1 }; static signed short abs_joystick_rudder[] = { ABS_X, ABS_Y, ABS_THROTTLE, ABS_RUDDER, ABS_HAT0X, ABS_HAT0Y, -1 }; static signed short abs_avb_pegasus[] = { ABS_X, ABS_Y, ABS_THROTTLE, ABS_RUDDER, ABS_HAT0X, ABS_HAT0Y, ABS_HAT1X, ABS_HAT1Y, -1 }; static signed short abs_wheel[] = { ABS_WHEEL, ABS_GAS, ABS_BRAKE, ABS_HAT0X, ABS_HAT0Y, -1 }; static signed short ff_iforce[] = { FF_PERIODIC, FF_CONSTANT, FF_SPRING, FF_DAMPER, FF_SQUARE, FF_TRIANGLE, FF_SINE, FF_SAW_UP, FF_SAW_DOWN, FF_GAIN, FF_AUTOCENTER, -1 }; static struct iforce_device iforce_device[] = { { 0x044f, 0xa01c, "Thrustmaster Motor Sport GT", btn_wheel, abs_wheel, ff_iforce }, { 0x046d, 0xc281, "Logitech WingMan Force", btn_joystick, abs_joystick, ff_iforce }, { 0x046d, 0xc291, "Logitech WingMan Formula Force", btn_wheel, abs_wheel, ff_iforce }, { 0x05ef, 0x020a, "AVB Top Shot Pegasus", btn_joystick_avb, abs_avb_pegasus, ff_iforce }, { 0x05ef, 0x8884, "AVB Mag Turbo Force", btn_wheel, abs_wheel, ff_iforce }, { 0x05ef, 0x8886, "Boeder Force Feedback Wheel", btn_wheel, abs_wheel, ff_iforce }, { 0x05ef, 0x8888, "AVB Top Shot Force Feedback Racing Wheel", btn_wheel, abs_wheel, ff_iforce }, //? { 0x061c, 0xc0a4, "ACT LABS Force RS", btn_wheel, abs_wheel, ff_iforce }, //? { 0x061c, 0xc084, "ACT LABS Force RS", btn_wheel, abs_wheel, ff_iforce }, { 0x06a3, 0xff04, "Saitek R440 Force Wheel", btn_wheel, abs_wheel, ff_iforce }, //? { 0x06f8, 0x0001, "Guillemot Race Leader Force Feedback", btn_wheel, abs_wheel, ff_iforce }, //? { 0x06f8, 0x0001, "Guillemot Jet Leader Force Feedback", btn_joystick, abs_joystick_rudder, ff_iforce }, { 0x06f8, 0x0004, "Guillemot Force Feedback Racing Wheel", btn_wheel, abs_wheel, ff_iforce }, //? { 0x06f8, 0xa302, "Guillemot Jet Leader 3D", btn_joystick, abs_joystick, ff_iforce }, //? { 0x06d6, 0x29bc, "Trust Force Feedback Race Master", btn_wheel, abs_wheel, ff_iforce }, { 0x0000, 0x0000, "Unknown I-Force Device [%04x:%04x]", btn_joystick, abs_joystick, ff_iforce } }; static int iforce_playback(struct input_dev *dev, int effect_id, int value) { struct iforce *iforce = input_get_drvdata(dev); struct iforce_core_effect *core_effect = &iforce->core_effects[effect_id]; if (value > 0) set_bit(FF_CORE_SHOULD_PLAY, core_effect->flags); else clear_bit(FF_CORE_SHOULD_PLAY, core_effect->flags); iforce_control_playback(iforce, effect_id, value); return 0; } static void iforce_set_gain(struct input_dev *dev, u16 gain) { struct iforce *iforce = input_get_drvdata(dev); unsigned char data[3]; data[0] = gain >> 9; iforce_send_packet(iforce, FF_CMD_GAIN, data); } static void iforce_set_autocenter(struct input_dev *dev, u16 magnitude) { struct iforce *iforce = input_get_drvdata(dev); unsigned char data[3]; data[0] = 0x03; data[1] = magnitude >> 9; iforce_send_packet(iforce, FF_CMD_AUTOCENTER, data); data[0] = 0x04; data[1] = 0x01; iforce_send_packet(iforce, FF_CMD_AUTOCENTER, data); } /* * Function called when an ioctl is performed on the event dev entry. * It uploads an effect to the device */ static int iforce_upload_effect(struct input_dev *dev, struct ff_effect *effect, struct ff_effect *old) { struct iforce *iforce = input_get_drvdata(dev); struct iforce_core_effect *core_effect = &iforce->core_effects[effect->id]; int ret; if (__test_and_set_bit(FF_CORE_IS_USED, core_effect->flags)) { /* Check the effect is not already being updated */ if (test_bit(FF_CORE_UPDATE, core_effect->flags)) return -EAGAIN; } /* * Upload the effect */ switch (effect->type) { case FF_PERIODIC: ret = iforce_upload_periodic(iforce, effect, old); break; case FF_CONSTANT: ret = iforce_upload_constant(iforce, effect, old); break; case FF_SPRING: case FF_DAMPER: ret = iforce_upload_condition(iforce, effect, old); break; default: return -EINVAL; } if (ret == 0) { /* A packet was sent, forbid new updates until we are notified * that the packet was updated */ set_bit(FF_CORE_UPDATE, core_effect->flags); } return ret; } /* * Erases an effect: it frees the effect id and mark as unused the memory * allocated for the parameters */ static int iforce_erase_effect(struct input_dev *dev, int effect_id) { struct iforce *iforce = input_get_drvdata(dev); struct iforce_core_effect *core_effect = &iforce->core_effects[effect_id]; int err = 0; if (test_bit(FF_MOD1_IS_USED, core_effect->flags)) err = release_resource(&core_effect->mod1_chunk); if (!err && test_bit(FF_MOD2_IS_USED, core_effect->flags)) err = release_resource(&core_effect->mod2_chunk); /* TODO: remember to change that if more FF_MOD* bits are added */ core_effect->flags[0] = 0; return err; } static int iforce_open(struct input_dev *dev) { struct iforce *iforce = input_get_drvdata(dev); iforce->xport_ops->start_io(iforce); if (test_bit(EV_FF, dev->evbit)) { /* Enable force feedback */ iforce_send_packet(iforce, FF_CMD_ENABLE, "\004"); } return 0; } static void iforce_close(struct input_dev *dev) { struct iforce *iforce = input_get_drvdata(dev); int i; if (test_bit(EV_FF, dev->evbit)) { /* Check: no effects should be present in memory */ for (i = 0; i < dev->ff->max_effects; i++) { if (test_bit(FF_CORE_IS_USED, iforce->core_effects[i].flags)) { dev_warn(&dev->dev, "%s: Device still owns effects\n", __func__); break; } } /* Disable force feedback playback */ iforce_send_packet(iforce, FF_CMD_ENABLE, "\001"); /* Wait for the command to complete */ wait_event_interruptible(iforce->wait, !test_bit(IFORCE_XMIT_RUNNING, iforce->xmit_flags)); } iforce->xport_ops->stop_io(iforce); } int iforce_init_device(struct device *parent, u16 bustype, struct iforce *iforce) { struct input_dev *input_dev; struct ff_device *ff; u8 c[] = "CEOV"; u8 buf[IFORCE_MAX_LENGTH]; size_t len; int i, error; int ff_effects = 0; input_dev = input_allocate_device(); if (!input_dev) return -ENOMEM; init_waitqueue_head(&iforce->wait); spin_lock_init(&iforce->xmit_lock); mutex_init(&iforce->mem_mutex); iforce->xmit.buf = iforce->xmit_data; iforce->dev = input_dev; /* * Input device fields. */ input_dev->id.bustype = bustype; input_dev->dev.parent = parent; input_set_drvdata(input_dev, iforce); input_dev->name = "Unknown I-Force device"; input_dev->open = iforce_open; input_dev->close = iforce_close; /* * On-device memory allocation. */ iforce->device_memory.name = "I-Force device effect memory"; iforce->device_memory.start = 0; iforce->device_memory.end = 200; iforce->device_memory.flags = IORESOURCE_MEM; iforce->device_memory.parent = NULL; iforce->device_memory.child = NULL; iforce->device_memory.sibling = NULL; /* * Wait until device ready - until it sends its first response. */ for (i = 0; i < 20; i++) if (!iforce_get_id_packet(iforce, 'O', buf, &len)) break; if (i == 20) { /* 5 seconds */ dev_err(&input_dev->dev, "Timeout waiting for response from device.\n"); error = -ENODEV; goto fail; } /* * Get device info. */ if (!iforce_get_id_packet(iforce, 'M', buf, &len) && len >= 3) input_dev->id.vendor = get_unaligned_le16(buf + 1); else dev_warn(&iforce->dev->dev, "Device does not respond to id packet M\n"); if (!iforce_get_id_packet(iforce, 'P', buf, &len) && len >= 3) input_dev->id.product = get_unaligned_le16(buf + 1); else dev_warn(&iforce->dev->dev, "Device does not respond to id packet P\n"); if (!iforce_get_id_packet(iforce, 'B', buf, &len) && len >= 3) iforce->device_memory.end = get_unaligned_le16(buf + 1); else dev_warn(&iforce->dev->dev, "Device does not respond to id packet B\n"); if (!iforce_get_id_packet(iforce, 'N', buf, &len) && len >= 2) ff_effects = buf[1]; else dev_warn(&iforce->dev->dev, "Device does not respond to id packet N\n"); /* Check if the device can store more effects than the driver can really handle */ if (ff_effects > IFORCE_EFFECTS_MAX) { dev_warn(&iforce->dev->dev, "Limiting number of effects to %d (device reports %d)\n", IFORCE_EFFECTS_MAX, ff_effects); ff_effects = IFORCE_EFFECTS_MAX; } /* * Display additional info. */ for (i = 0; c[i]; i++) if (!iforce_get_id_packet(iforce, c[i], buf, &len)) iforce_dump_packet(iforce, "info", (FF_CMD_QUERY & 0xff00) | len, buf); /* * Disable spring, enable force feedback. */ iforce_set_autocenter(input_dev, 0); /* * Find appropriate device entry */ for (i = 0; iforce_device[i].idvendor; i++) if (iforce_device[i].idvendor == input_dev->id.vendor && iforce_device[i].idproduct == input_dev->id.product) break; iforce->type = iforce_device + i; input_dev->name = iforce->type->name; /* * Set input device bitfields and ranges. */ input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) | BIT_MASK(EV_FF_STATUS); for (i = 0; iforce->type->btn[i] >= 0; i++) set_bit(iforce->type->btn[i], input_dev->keybit); for (i = 0; iforce->type->abs[i] >= 0; i++) { signed short t = iforce->type->abs[i]; switch (t) { case ABS_X: case ABS_Y: case ABS_WHEEL: input_set_abs_params(input_dev, t, -1920, 1920, 16, 128); set_bit(t, input_dev->ffbit); break; case ABS_THROTTLE: case ABS_GAS: case ABS_BRAKE: input_set_abs_params(input_dev, t, 0, 255, 0, 0); break; case ABS_RUDDER: input_set_abs_params(input_dev, t, -128, 127, 0, 0); break; case ABS_HAT0X: case ABS_HAT0Y: case ABS_HAT1X: case ABS_HAT1Y: input_set_abs_params(input_dev, t, -1, 1, 0, 0); break; } } if (ff_effects) { for (i = 0; iforce->type->ff[i] >= 0; i++) set_bit(iforce->type->ff[i], input_dev->ffbit); error = input_ff_create(input_dev, ff_effects); if (error) goto fail; ff = input_dev->ff; ff->upload = iforce_upload_effect; ff->erase = iforce_erase_effect; ff->set_gain = iforce_set_gain; ff->set_autocenter = iforce_set_autocenter; ff->playback = iforce_playback; } /* * Register input device. */ error = input_register_device(iforce->dev); if (error) goto fail; return 0; fail: input_free_device(input_dev); return error; } EXPORT_SYMBOL(iforce_init_device); |
1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 | // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" #include "bitset.h" struct wol_req_info { struct ethnl_req_info base; }; struct wol_reply_data { struct ethnl_reply_data base; struct ethtool_wolinfo wol; bool show_sopass; }; #define WOL_REPDATA(__reply_base) \ container_of(__reply_base, struct wol_reply_data, base) const struct nla_policy ethnl_wol_get_policy[] = { [ETHTOOL_A_WOL_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int wol_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct wol_reply_data *data = WOL_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_wol) return -EOPNOTSUPP; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; dev->ethtool_ops->get_wol(dev, &data->wol); ethnl_ops_complete(dev); /* do not include password in notifications */ data->show_sopass = !genl_info_is_ntf(info) && (data->wol.supported & WAKE_MAGICSECURE); return 0; } static int wol_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct wol_reply_data *data = WOL_REPDATA(reply_base); int len; len = ethnl_bitset32_size(&data->wol.wolopts, &data->wol.supported, WOL_MODE_COUNT, wol_mode_names, compact); if (len < 0) return len; if (data->show_sopass) len += nla_total_size(sizeof(data->wol.sopass)); return len; } static int wol_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct wol_reply_data *data = WOL_REPDATA(reply_base); int ret; ret = ethnl_put_bitset32(skb, ETHTOOL_A_WOL_MODES, &data->wol.wolopts, &data->wol.supported, WOL_MODE_COUNT, wol_mode_names, compact); if (ret < 0) return ret; if (data->show_sopass && nla_put(skb, ETHTOOL_A_WOL_SOPASS, sizeof(data->wol.sopass), data->wol.sopass)) return -EMSGSIZE; return 0; } /* WOL_SET */ const struct nla_policy ethnl_wol_set_policy[] = { [ETHTOOL_A_WOL_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_WOL_MODES] = { .type = NLA_NESTED }, [ETHTOOL_A_WOL_SOPASS] = { .type = NLA_BINARY, .len = SOPASS_MAX }, }; static int ethnl_set_wol_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; return ops->get_wol && ops->set_wol ? 1 : -EOPNOTSUPP; } static int ethnl_set_wol(struct ethnl_req_info *req_info, struct genl_info *info) { struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; bool mod = false; int ret; dev->ethtool_ops->get_wol(dev, &wol); ret = ethnl_update_bitset32(&wol.wolopts, WOL_MODE_COUNT, tb[ETHTOOL_A_WOL_MODES], wol_mode_names, info->extack, &mod); if (ret < 0) return ret; if (wol.wolopts & ~wol.supported) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_WOL_MODES], "cannot enable unsupported WoL mode"); return -EINVAL; } if (tb[ETHTOOL_A_WOL_SOPASS]) { if (!(wol.supported & WAKE_MAGICSECURE)) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_WOL_SOPASS], "magicsecure not supported, cannot set password"); return -EINVAL; } ethnl_update_binary(wol.sopass, sizeof(wol.sopass), tb[ETHTOOL_A_WOL_SOPASS], &mod); } if (!mod) return 0; ret = dev->ethtool_ops->set_wol(dev, &wol); if (ret) return ret; dev->ethtool->wol_enabled = !!wol.wolopts; return 1; } const struct ethnl_request_ops ethnl_wol_request_ops = { .request_cmd = ETHTOOL_MSG_WOL_GET, .reply_cmd = ETHTOOL_MSG_WOL_GET_REPLY, .hdr_attr = ETHTOOL_A_WOL_HEADER, .req_info_size = sizeof(struct wol_req_info), .reply_data_size = sizeof(struct wol_reply_data), .prepare_data = wol_prepare_data, .reply_size = wol_reply_size, .fill_reply = wol_fill_reply, .set_validate = ethnl_set_wol_validate, .set = ethnl_set_wol, .set_ntf_cmd = ETHTOOL_MSG_WOL_NTF, }; |
16 16 16 12 12 48 48 45 5 48 8 1 1 1 5 5 1 1 98 98 2 2 98 97 43 44 2 51 51 48 4 2 1 1 53 1 1 41 4 11 2 2 2 51 6 45 99 99 79 17 17 56 14 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 | // SPDX-License-Identifier: GPL-2.0 /* * cfg80211 MLME SAP interface * * Copyright (c) 2009, Jouni Malinen <j@w1.fi> * Copyright (c) 2015 Intel Deutschland GmbH * Copyright (C) 2019-2020, 2022-2024 Intel Corporation */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/nl80211.h> #include <linux/slab.h> #include <linux/wireless.h> #include <net/cfg80211.h> #include <net/iw_handler.h> #include "core.h" #include "nl80211.h" #include "rdev-ops.h" void cfg80211_rx_assoc_resp(struct net_device *dev, const struct cfg80211_rx_assoc_resp_data *data) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)data->buf; struct cfg80211_connect_resp_params cr = { .timeout_reason = NL80211_TIMEOUT_UNSPECIFIED, .req_ie = data->req_ies, .req_ie_len = data->req_ies_len, .resp_ie = mgmt->u.assoc_resp.variable, .resp_ie_len = data->len - offsetof(struct ieee80211_mgmt, u.assoc_resp.variable), .status = le16_to_cpu(mgmt->u.assoc_resp.status_code), .ap_mld_addr = data->ap_mld_addr, }; unsigned int link_id; for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) { cr.links[link_id].status = data->links[link_id].status; cr.links[link_id].bss = data->links[link_id].bss; WARN_ON_ONCE(cr.links[link_id].status != WLAN_STATUS_SUCCESS && (!cr.ap_mld_addr || !cr.links[link_id].bss)); if (!cr.links[link_id].bss) continue; cr.links[link_id].bssid = data->links[link_id].bss->bssid; cr.links[link_id].addr = data->links[link_id].addr; /* need to have local link addresses for MLO connections */ WARN_ON(cr.ap_mld_addr && !is_valid_ether_addr(cr.links[link_id].addr)); BUG_ON(!cr.links[link_id].bss->channel); if (cr.links[link_id].bss->channel->band == NL80211_BAND_S1GHZ) { WARN_ON(link_id); cr.resp_ie = (u8 *)&mgmt->u.s1g_assoc_resp.variable; cr.resp_ie_len = data->len - offsetof(struct ieee80211_mgmt, u.s1g_assoc_resp.variable); } if (cr.ap_mld_addr) cr.valid_links |= BIT(link_id); } trace_cfg80211_send_rx_assoc(dev, data); /* * This is a bit of a hack, we don't notify userspace of * a (re-)association reply if we tried to send a reassoc * and got a reject -- we only try again with an assoc * frame instead of reassoc. */ if (cfg80211_sme_rx_assoc_resp(wdev, cr.status)) { for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) { struct cfg80211_bss *bss = data->links[link_id].bss; if (!bss) continue; cfg80211_unhold_bss(bss_from_pub(bss)); cfg80211_put_bss(wiphy, bss); } return; } nl80211_send_rx_assoc(rdev, dev, data); /* update current_bss etc., consumes the bss reference */ __cfg80211_connect_result(dev, &cr, cr.status == WLAN_STATUS_SUCCESS); } EXPORT_SYMBOL(cfg80211_rx_assoc_resp); static void cfg80211_process_auth(struct wireless_dev *wdev, const u8 *buf, size_t len) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); nl80211_send_rx_auth(rdev, wdev->netdev, buf, len, GFP_KERNEL); cfg80211_sme_rx_auth(wdev, buf, len); } static void cfg80211_process_deauth(struct wireless_dev *wdev, const u8 *buf, size_t len, bool reconnect) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; const u8 *bssid = mgmt->bssid; u16 reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); bool from_ap = !ether_addr_equal(mgmt->sa, wdev->netdev->dev_addr); nl80211_send_deauth(rdev, wdev->netdev, buf, len, reconnect, GFP_KERNEL); if (!wdev->connected || !ether_addr_equal(wdev->u.client.connected_addr, bssid)) return; __cfg80211_disconnected(wdev->netdev, NULL, 0, reason_code, from_ap); cfg80211_sme_deauth(wdev); } static void cfg80211_process_disassoc(struct wireless_dev *wdev, const u8 *buf, size_t len, bool reconnect) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; const u8 *bssid = mgmt->bssid; u16 reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code); bool from_ap = !ether_addr_equal(mgmt->sa, wdev->netdev->dev_addr); nl80211_send_disassoc(rdev, wdev->netdev, buf, len, reconnect, GFP_KERNEL); if (WARN_ON(!wdev->connected || !ether_addr_equal(wdev->u.client.connected_addr, bssid))) return; __cfg80211_disconnected(wdev->netdev, NULL, 0, reason_code, from_ap); cfg80211_sme_disassoc(wdev); } void cfg80211_rx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct ieee80211_mgmt *mgmt = (void *)buf; lockdep_assert_wiphy(wdev->wiphy); trace_cfg80211_rx_mlme_mgmt(dev, buf, len); if (WARN_ON(len < 2)) return; if (ieee80211_is_auth(mgmt->frame_control)) cfg80211_process_auth(wdev, buf, len); else if (ieee80211_is_deauth(mgmt->frame_control)) cfg80211_process_deauth(wdev, buf, len, false); else if (ieee80211_is_disassoc(mgmt->frame_control)) cfg80211_process_disassoc(wdev, buf, len, false); } EXPORT_SYMBOL(cfg80211_rx_mlme_mgmt); void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); trace_cfg80211_send_auth_timeout(dev, addr); nl80211_send_auth_timeout(rdev, dev, addr, GFP_KERNEL); cfg80211_sme_auth_timeout(wdev); } EXPORT_SYMBOL(cfg80211_auth_timeout); void cfg80211_assoc_failure(struct net_device *dev, struct cfg80211_assoc_failure *data) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); const u8 *addr = data->ap_mld_addr ?: data->bss[0]->bssid; int i; trace_cfg80211_send_assoc_failure(dev, data); if (data->timeout) { nl80211_send_assoc_timeout(rdev, dev, addr, GFP_KERNEL); cfg80211_sme_assoc_timeout(wdev); } else { cfg80211_sme_abandon_assoc(wdev); } for (i = 0; i < ARRAY_SIZE(data->bss); i++) { struct cfg80211_bss *bss = data->bss[i]; if (!bss) continue; cfg80211_unhold_bss(bss_from_pub(bss)); cfg80211_put_bss(wiphy, bss); } } EXPORT_SYMBOL(cfg80211_assoc_failure); void cfg80211_tx_mlme_mgmt(struct net_device *dev, const u8 *buf, size_t len, bool reconnect) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct ieee80211_mgmt *mgmt = (void *)buf; lockdep_assert_wiphy(wdev->wiphy); trace_cfg80211_tx_mlme_mgmt(dev, buf, len, reconnect); if (WARN_ON(len < 2)) return; if (ieee80211_is_deauth(mgmt->frame_control)) cfg80211_process_deauth(wdev, buf, len, reconnect); else cfg80211_process_disassoc(wdev, buf, len, reconnect); } EXPORT_SYMBOL(cfg80211_tx_mlme_mgmt); void cfg80211_michael_mic_failure(struct net_device *dev, const u8 *addr, enum nl80211_key_type key_type, int key_id, const u8 *tsc, gfp_t gfp) { struct wiphy *wiphy = dev->ieee80211_ptr->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; char *buf = kmalloc(128, gfp); if (buf) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = sprintf(buf, "MLME-MICHAELMICFAILURE." "indication(keyid=%d %scast addr=%pM)", key_id, key_type == NL80211_KEYTYPE_GROUP ? "broad" : "uni", addr); wireless_send_event(dev, IWEVCUSTOM, &wrqu, buf); kfree(buf); } #endif trace_cfg80211_michael_mic_failure(dev, addr, key_type, key_id, tsc); nl80211_michael_mic_failure(rdev, dev, addr, key_type, key_id, tsc, gfp); } EXPORT_SYMBOL(cfg80211_michael_mic_failure); /* some MLME handling for userspace SME */ int cfg80211_mlme_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_auth_request *req) { struct wireless_dev *wdev = dev->ieee80211_ptr; lockdep_assert_wiphy(wdev->wiphy); if (!req->bss) return -ENOENT; if (req->link_id >= 0 && !(wdev->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO)) return -EINVAL; if (req->auth_type == NL80211_AUTHTYPE_SHARED_KEY) { if (!req->key || !req->key_len || req->key_idx < 0 || req->key_idx > 3) return -EINVAL; } if (wdev->connected && ether_addr_equal(req->bss->bssid, wdev->u.client.connected_addr)) return -EALREADY; if (ether_addr_equal(req->bss->bssid, dev->dev_addr) || (req->link_id >= 0 && ether_addr_equal(req->ap_mld_addr, dev->dev_addr))) return -EINVAL; return rdev_auth(rdev, dev, req); } /* Do a logical ht_capa &= ht_capa_mask. */ void cfg80211_oper_and_ht_capa(struct ieee80211_ht_cap *ht_capa, const struct ieee80211_ht_cap *ht_capa_mask) { int i; u8 *p1, *p2; if (!ht_capa_mask) { memset(ht_capa, 0, sizeof(*ht_capa)); return; } p1 = (u8*)(ht_capa); p2 = (u8*)(ht_capa_mask); for (i = 0; i < sizeof(*ht_capa); i++) p1[i] &= p2[i]; } /* Do a logical vht_capa &= vht_capa_mask. */ void cfg80211_oper_and_vht_capa(struct ieee80211_vht_cap *vht_capa, const struct ieee80211_vht_cap *vht_capa_mask) { int i; u8 *p1, *p2; if (!vht_capa_mask) { memset(vht_capa, 0, sizeof(*vht_capa)); return; } p1 = (u8*)(vht_capa); p2 = (u8*)(vht_capa_mask); for (i = 0; i < sizeof(*vht_capa); i++) p1[i] &= p2[i]; } static int cfg80211_mlme_check_mlo_compat(const struct ieee80211_multi_link_elem *mle_a, const struct ieee80211_multi_link_elem *mle_b, struct netlink_ext_ack *extack) { const struct ieee80211_mle_basic_common_info *common_a, *common_b; common_a = (const void *)mle_a->variable; common_b = (const void *)mle_b->variable; if (memcmp(common_a->mld_mac_addr, common_b->mld_mac_addr, ETH_ALEN)) { NL_SET_ERR_MSG(extack, "AP MLD address mismatch"); return -EINVAL; } if (ieee80211_mle_get_eml_cap((const u8 *)mle_a) != ieee80211_mle_get_eml_cap((const u8 *)mle_b)) { NL_SET_ERR_MSG(extack, "link EML capabilities mismatch"); return -EINVAL; } if (ieee80211_mle_get_mld_capa_op((const u8 *)mle_a) != ieee80211_mle_get_mld_capa_op((const u8 *)mle_b)) { NL_SET_ERR_MSG(extack, "link MLD capabilities/ops mismatch"); return -EINVAL; } return 0; } static int cfg80211_mlme_check_mlo(struct net_device *dev, struct cfg80211_assoc_request *req, struct netlink_ext_ack *extack) { const struct ieee80211_multi_link_elem *mles[ARRAY_SIZE(req->links)] = {}; int i; if (req->link_id < 0) return 0; if (!req->links[req->link_id].bss) { NL_SET_ERR_MSG(extack, "no BSS for assoc link"); return -EINVAL; } rcu_read_lock(); for (i = 0; i < ARRAY_SIZE(req->links); i++) { const struct cfg80211_bss_ies *ies; const struct element *ml; if (!req->links[i].bss) continue; if (ether_addr_equal(req->links[i].bss->bssid, dev->dev_addr)) { NL_SET_ERR_MSG(extack, "BSSID must not be our address"); req->links[i].error = -EINVAL; goto error; } ies = rcu_dereference(req->links[i].bss->ies); ml = cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_MULTI_LINK, ies->data, ies->len); if (!ml) { NL_SET_ERR_MSG(extack, "MLO BSS w/o ML element"); req->links[i].error = -EINVAL; goto error; } if (!ieee80211_mle_type_ok(ml->data + 1, IEEE80211_ML_CONTROL_TYPE_BASIC, ml->datalen - 1)) { NL_SET_ERR_MSG(extack, "BSS with invalid ML element"); req->links[i].error = -EINVAL; goto error; } mles[i] = (const void *)(ml->data + 1); if (ieee80211_mle_get_link_id((const u8 *)mles[i]) != i) { NL_SET_ERR_MSG(extack, "link ID mismatch"); req->links[i].error = -EINVAL; goto error; } } if (WARN_ON(!mles[req->link_id])) goto error; for (i = 0; i < ARRAY_SIZE(req->links); i++) { if (i == req->link_id || !req->links[i].bss) continue; if (WARN_ON(!mles[i])) goto error; if (cfg80211_mlme_check_mlo_compat(mles[req->link_id], mles[i], extack)) { req->links[i].error = -EINVAL; goto error; } } rcu_read_unlock(); return 0; error: rcu_read_unlock(); return -EINVAL; } /* Note: caller must cfg80211_put_bss() regardless of result */ int cfg80211_mlme_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_assoc_request *req, struct netlink_ext_ack *extack) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; lockdep_assert_wiphy(wdev->wiphy); err = cfg80211_mlme_check_mlo(dev, req, extack); if (err) return err; if (wdev->connected && (!req->prev_bssid || !ether_addr_equal(wdev->u.client.connected_addr, req->prev_bssid))) return -EALREADY; if ((req->bss && ether_addr_equal(req->bss->bssid, dev->dev_addr)) || (req->link_id >= 0 && ether_addr_equal(req->ap_mld_addr, dev->dev_addr))) return -EINVAL; cfg80211_oper_and_ht_capa(&req->ht_capa_mask, rdev->wiphy.ht_capa_mod_mask); cfg80211_oper_and_vht_capa(&req->vht_capa_mask, rdev->wiphy.vht_capa_mod_mask); err = rdev_assoc(rdev, dev, req); if (!err) { int link_id; if (req->bss) { cfg80211_ref_bss(&rdev->wiphy, req->bss); cfg80211_hold_bss(bss_from_pub(req->bss)); } for (link_id = 0; link_id < ARRAY_SIZE(req->links); link_id++) { if (!req->links[link_id].bss) continue; cfg80211_ref_bss(&rdev->wiphy, req->links[link_id].bss); cfg80211_hold_bss(bss_from_pub(req->links[link_id].bss)); } } return err; } int cfg80211_mlme_deauth(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *bssid, const u8 *ie, int ie_len, u16 reason, bool local_state_change) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_deauth_request req = { .bssid = bssid, .reason_code = reason, .ie = ie, .ie_len = ie_len, .local_state_change = local_state_change, }; lockdep_assert_wiphy(wdev->wiphy); if (local_state_change && (!wdev->connected || !ether_addr_equal(wdev->u.client.connected_addr, bssid))) return 0; if (ether_addr_equal(wdev->disconnect_bssid, bssid) || (wdev->connected && ether_addr_equal(wdev->u.client.connected_addr, bssid))) wdev->conn_owner_nlportid = 0; return rdev_deauth(rdev, dev, &req); } int cfg80211_mlme_disassoc(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *ap_addr, const u8 *ie, int ie_len, u16 reason, bool local_state_change) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_disassoc_request req = { .reason_code = reason, .local_state_change = local_state_change, .ie = ie, .ie_len = ie_len, .ap_addr = ap_addr, }; int err; lockdep_assert_wiphy(wdev->wiphy); if (!wdev->connected) return -ENOTCONN; if (memcmp(wdev->u.client.connected_addr, ap_addr, ETH_ALEN)) return -ENOTCONN; err = rdev_disassoc(rdev, dev, &req); if (err) return err; /* driver should have reported the disassoc */ WARN_ON(wdev->connected); return 0; } void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; u8 bssid[ETH_ALEN]; lockdep_assert_wiphy(wdev->wiphy); if (!rdev->ops->deauth) return; if (!wdev->connected) return; memcpy(bssid, wdev->u.client.connected_addr, ETH_ALEN); cfg80211_mlme_deauth(rdev, dev, bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); } struct cfg80211_mgmt_registration { struct list_head list; struct wireless_dev *wdev; u32 nlportid; int match_len; __le16 frame_type; bool multicast_rx; u8 match[]; }; static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct wireless_dev *tmp; struct cfg80211_mgmt_registration *reg; struct mgmt_frame_regs upd = {}; lockdep_assert_held(&rdev->wiphy.mtx); spin_lock_bh(&rdev->mgmt_registrations_lock); if (!wdev->mgmt_registrations_need_update) { spin_unlock_bh(&rdev->mgmt_registrations_lock); return; } rcu_read_lock(); list_for_each_entry_rcu(tmp, &rdev->wiphy.wdev_list, list) { list_for_each_entry(reg, &tmp->mgmt_registrations, list) { u32 mask = BIT(le16_to_cpu(reg->frame_type) >> 4); u32 mcast_mask = 0; if (reg->multicast_rx) mcast_mask = mask; upd.global_stypes |= mask; upd.global_mcast_stypes |= mcast_mask; if (tmp == wdev) { upd.interface_stypes |= mask; upd.interface_mcast_stypes |= mcast_mask; } } } rcu_read_unlock(); wdev->mgmt_registrations_need_update = 0; spin_unlock_bh(&rdev->mgmt_registrations_lock); rdev_update_mgmt_frame_registrations(rdev, wdev, &upd); } void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; rdev = container_of(wk, struct cfg80211_registered_device, mgmt_registrations_update_wk); wiphy_lock(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_mgmt_registrations_update(wdev); wiphy_unlock(&rdev->wiphy); } int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, u16 frame_type, const u8 *match_data, int match_len, bool multicast_rx, struct netlink_ext_ack *extack) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; u16 mgmt_type; bool update_multicast = false; if (!wdev->wiphy->mgmt_stypes) return -EOPNOTSUPP; if ((frame_type & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT) { NL_SET_ERR_MSG(extack, "frame type not management"); return -EINVAL; } if (frame_type & ~(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE)) { NL_SET_ERR_MSG(extack, "Invalid frame type"); return -EINVAL; } mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4; if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].rx & BIT(mgmt_type))) { NL_SET_ERR_MSG(extack, "Registration to specific type not supported"); return -EINVAL; } /* * To support Pre Association Security Negotiation (PASN), registration * for authentication frames should be supported. However, as some * versions of the user space daemons wrongly register to all types of * authentication frames (which might result in unexpected behavior) * allow such registration if the request is for a specific * authentication algorithm number. */ if (wdev->iftype == NL80211_IFTYPE_STATION && (frame_type & IEEE80211_FCTL_STYPE) == IEEE80211_STYPE_AUTH && !(match_data && match_len >= 2)) { NL_SET_ERR_MSG(extack, "Authentication algorithm number required"); return -EINVAL; } nreg = kzalloc(sizeof(*reg) + match_len, GFP_KERNEL); if (!nreg) return -ENOMEM; spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry(reg, &wdev->mgmt_registrations, list) { int mlen = min(match_len, reg->match_len); if (frame_type != le16_to_cpu(reg->frame_type)) continue; if (memcmp(reg->match, match_data, mlen) == 0) { if (reg->multicast_rx != multicast_rx) { update_multicast = true; reg->multicast_rx = multicast_rx; break; } NL_SET_ERR_MSG(extack, "Match already configured"); err = -EALREADY; break; } } if (err) goto out; if (update_multicast) { kfree(nreg); } else { memcpy(nreg->match, match_data, match_len); nreg->match_len = match_len; nreg->nlportid = snd_portid; nreg->frame_type = cpu_to_le16(frame_type); nreg->wdev = wdev; nreg->multicast_rx = multicast_rx; list_add(&nreg->list, &wdev->mgmt_registrations); } wdev->mgmt_registrations_need_update = 1; spin_unlock_bh(&rdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); return 0; out: kfree(nreg); spin_unlock_bh(&rdev->mgmt_registrations_lock); return err; } void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg, *tmp; spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { if (reg->nlportid != nlportid) continue; list_del(®->list); kfree(reg); wdev->mgmt_registrations_need_update = 1; schedule_work(&rdev->mgmt_registrations_update_wk); } spin_unlock_bh(&rdev->mgmt_registrations_lock); if (nlportid && rdev->crit_proto_nlportid == nlportid) { rdev->crit_proto_nlportid = 0; rdev_crit_proto_stop(rdev, wdev); } if (nlportid == wdev->ap_unexpected_nlportid) wdev->ap_unexpected_nlportid = 0; } void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_mgmt_registration *reg, *tmp; spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { list_del(®->list); kfree(reg); } wdev->mgmt_registrations_need_update = 1; spin_unlock_bh(&rdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); } static bool cfg80211_allowed_address(struct wireless_dev *wdev, const u8 *addr) { int i; for_each_valid_link(wdev, i) { if (ether_addr_equal(addr, wdev->links[i].addr)) return true; } return ether_addr_equal(addr, wdev_address(wdev)); } static bool cfg80211_allowed_random_address(struct wireless_dev *wdev, const struct ieee80211_mgmt *mgmt) { if (ieee80211_is_auth(mgmt->frame_control) || ieee80211_is_deauth(mgmt->frame_control)) { /* Allow random TA to be used with authentication and * deauthentication frames if the driver has indicated support. */ if (wiphy_ext_feature_isset( wdev->wiphy, NL80211_EXT_FEATURE_AUTH_AND_DEAUTH_RANDOM_TA)) return true; } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_PUBLIC) { /* Allow random TA to be used with Public Action frames if the * driver has indicated support. */ if (!wdev->connected && wiphy_ext_feature_isset( wdev->wiphy, NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA)) return true; if (wdev->connected && wiphy_ext_feature_isset( wdev->wiphy, NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED)) return true; } return false; } int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie) { const struct ieee80211_mgmt *mgmt; u16 stype; lockdep_assert_wiphy(&rdev->wiphy); if (!wdev->wiphy->mgmt_stypes) return -EOPNOTSUPP; if (!rdev->ops->mgmt_tx) return -EOPNOTSUPP; if (params->len < 24 + 1) return -EINVAL; mgmt = (const struct ieee80211_mgmt *)params->buf; if (!ieee80211_is_mgmt(mgmt->frame_control)) return -EINVAL; stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE; if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].tx & BIT(stype >> 4))) return -EINVAL; if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) { int err = 0; switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: /* * check for IBSS DA must be done by driver as * cfg80211 doesn't track the stations */ if (!wdev->u.ibss.current_bss || !ether_addr_equal(wdev->u.ibss.current_bss->pub.bssid, mgmt->bssid)) { err = -ENOTCONN; break; } break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: if (!wdev->connected) { err = -ENOTCONN; break; } /* FIXME: MLD may address this differently */ if (!ether_addr_equal(wdev->u.client.connected_addr, mgmt->bssid)) { err = -ENOTCONN; break; } /* for station, check that DA is the AP */ if (!ether_addr_equal(wdev->u.client.connected_addr, mgmt->da)) { err = -ENOTCONN; break; } break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_AP_VLAN: if (!ether_addr_equal(mgmt->bssid, wdev_address(wdev)) && (params->link_id < 0 || !ether_addr_equal(mgmt->bssid, wdev->links[params->link_id].addr))) err = -EINVAL; break; case NL80211_IFTYPE_MESH_POINT: if (!ether_addr_equal(mgmt->sa, mgmt->bssid)) { err = -EINVAL; break; } /* * check for mesh DA must be done by driver as * cfg80211 doesn't track the stations */ break; case NL80211_IFTYPE_P2P_DEVICE: /* * fall through, P2P device only supports * public action frames */ case NL80211_IFTYPE_NAN: default: err = -EOPNOTSUPP; break; } if (err) return err; } if (!cfg80211_allowed_address(wdev, mgmt->sa) && !cfg80211_allowed_random_address(wdev, mgmt)) return -EINVAL; /* Transmit the management frame as requested by user space */ return rdev_mgmt_tx(rdev, wdev, params, cookie); } bool cfg80211_rx_mgmt_ext(struct wireless_dev *wdev, struct cfg80211_rx_info *info) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg; const struct ieee80211_txrx_stypes *stypes = &wiphy->mgmt_stypes[wdev->iftype]; struct ieee80211_mgmt *mgmt = (void *)info->buf; const u8 *data; int data_len; bool result = false; __le16 ftype = mgmt->frame_control & cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE); u16 stype; trace_cfg80211_rx_mgmt(wdev, info); stype = (le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE) >> 4; if (!(stypes->rx & BIT(stype))) { trace_cfg80211_return_bool(false); return false; } data = info->buf + ieee80211_hdrlen(mgmt->frame_control); data_len = info->len - ieee80211_hdrlen(mgmt->frame_control); spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry(reg, &wdev->mgmt_registrations, list) { if (reg->frame_type != ftype) continue; if (reg->match_len > data_len) continue; if (memcmp(reg->match, data, reg->match_len)) continue; /* found match! */ /* Indicate the received Action frame to user space */ if (nl80211_send_mgmt(rdev, wdev, reg->nlportid, info, GFP_ATOMIC)) continue; result = true; break; } spin_unlock_bh(&rdev->mgmt_registrations_lock); trace_cfg80211_return_bool(result); return result; } EXPORT_SYMBOL(cfg80211_rx_mgmt_ext); void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev) { cancel_delayed_work(&rdev->dfs_update_channels_wk); queue_delayed_work(cfg80211_wq, &rdev->dfs_update_channels_wk, 0); } void cfg80211_dfs_channels_update_work(struct work_struct *work) { struct delayed_work *delayed_work = to_delayed_work(work); struct cfg80211_registered_device *rdev; struct cfg80211_chan_def chandef; struct ieee80211_supported_band *sband; struct ieee80211_channel *c; struct wiphy *wiphy; bool check_again = false; unsigned long timeout, next_time = 0; unsigned long time_dfs_update; enum nl80211_radar_event radar_event; int bandid, i; rdev = container_of(delayed_work, struct cfg80211_registered_device, dfs_update_channels_wk); wiphy = &rdev->wiphy; rtnl_lock(); for (bandid = 0; bandid < NUM_NL80211_BANDS; bandid++) { sband = wiphy->bands[bandid]; if (!sband) continue; for (i = 0; i < sband->n_channels; i++) { c = &sband->channels[i]; if (!(c->flags & IEEE80211_CHAN_RADAR)) continue; if (c->dfs_state != NL80211_DFS_UNAVAILABLE && c->dfs_state != NL80211_DFS_AVAILABLE) continue; if (c->dfs_state == NL80211_DFS_UNAVAILABLE) { time_dfs_update = IEEE80211_DFS_MIN_NOP_TIME_MS; radar_event = NL80211_RADAR_NOP_FINISHED; } else { if (regulatory_pre_cac_allowed(wiphy) || cfg80211_any_wiphy_oper_chan(wiphy, c)) continue; time_dfs_update = REG_PRE_CAC_EXPIRY_GRACE_MS; radar_event = NL80211_RADAR_PRE_CAC_EXPIRED; } timeout = c->dfs_state_entered + msecs_to_jiffies(time_dfs_update); if (time_after_eq(jiffies, timeout)) { c->dfs_state = NL80211_DFS_USABLE; c->dfs_state_entered = jiffies; cfg80211_chandef_create(&chandef, c, NL80211_CHAN_NO_HT); nl80211_radar_notify(rdev, &chandef, radar_event, NULL, GFP_ATOMIC); regulatory_propagate_dfs_state(wiphy, &chandef, c->dfs_state, radar_event); continue; } if (!check_again) next_time = timeout - jiffies; else next_time = min(next_time, timeout - jiffies); check_again = true; } } rtnl_unlock(); /* reschedule if there are other channels waiting to be cleared again */ if (check_again) queue_delayed_work(cfg80211_wq, &rdev->dfs_update_channels_wk, next_time); } void __cfg80211_radar_event(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, bool offchan, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); trace_cfg80211_radar_event(wiphy, chandef, offchan); /* only set the chandef supplied channel to unavailable, in * case the radar is detected on only one of multiple channels * spanned by the chandef. */ cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_UNAVAILABLE); if (offchan) queue_work(cfg80211_wq, &rdev->background_cac_abort_wk); cfg80211_sched_dfs_chan_update(rdev); nl80211_radar_notify(rdev, chandef, NL80211_RADAR_DETECTED, NULL, gfp); memcpy(&rdev->radar_chandef, chandef, sizeof(struct cfg80211_chan_def)); queue_work(cfg80211_wq, &rdev->propagate_radar_detect_wk); } EXPORT_SYMBOL(__cfg80211_radar_event); void cfg80211_cac_event(struct net_device *netdev, const struct cfg80211_chan_def *chandef, enum nl80211_radar_event event, gfp_t gfp, unsigned int link_id) { struct wireless_dev *wdev = netdev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); unsigned long timeout; if (WARN_ON(wdev->valid_links && !(wdev->valid_links & BIT(link_id)))) return; trace_cfg80211_cac_event(netdev, event, link_id); if (WARN_ON(!wdev->links[link_id].cac_started && event != NL80211_RADAR_CAC_STARTED)) return; switch (event) { case NL80211_RADAR_CAC_FINISHED: timeout = wdev->links[link_id].cac_start_time + msecs_to_jiffies(wdev->links[link_id].cac_time_ms); WARN_ON(!time_after_eq(jiffies, timeout)); cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_AVAILABLE); memcpy(&rdev->cac_done_chandef, chandef, sizeof(struct cfg80211_chan_def)); queue_work(cfg80211_wq, &rdev->propagate_cac_done_wk); cfg80211_sched_dfs_chan_update(rdev); fallthrough; case NL80211_RADAR_CAC_ABORTED: wdev->links[link_id].cac_started = false; break; case NL80211_RADAR_CAC_STARTED: wdev->links[link_id].cac_started = true; break; default: WARN_ON(1); return; } nl80211_radar_notify(rdev, chandef, event, netdev, gfp); } EXPORT_SYMBOL(cfg80211_cac_event); static void __cfg80211_background_cac_event(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, const struct cfg80211_chan_def *chandef, enum nl80211_radar_event event) { struct wiphy *wiphy = &rdev->wiphy; struct net_device *netdev; lockdep_assert_wiphy(&rdev->wiphy); if (!cfg80211_chandef_valid(chandef)) return; if (!rdev->background_radar_wdev) return; switch (event) { case NL80211_RADAR_CAC_FINISHED: cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_AVAILABLE); memcpy(&rdev->cac_done_chandef, chandef, sizeof(*chandef)); queue_work(cfg80211_wq, &rdev->propagate_cac_done_wk); cfg80211_sched_dfs_chan_update(rdev); wdev = rdev->background_radar_wdev; break; case NL80211_RADAR_CAC_ABORTED: if (!cancel_delayed_work(&rdev->background_cac_done_wk)) return; wdev = rdev->background_radar_wdev; break; case NL80211_RADAR_CAC_STARTED: break; default: return; } netdev = wdev ? wdev->netdev : NULL; nl80211_radar_notify(rdev, chandef, event, netdev, GFP_KERNEL); } static void cfg80211_background_cac_event(struct cfg80211_registered_device *rdev, const struct cfg80211_chan_def *chandef, enum nl80211_radar_event event) { wiphy_lock(&rdev->wiphy); __cfg80211_background_cac_event(rdev, rdev->background_radar_wdev, chandef, event); wiphy_unlock(&rdev->wiphy); } void cfg80211_background_cac_done_wk(struct work_struct *work) { struct delayed_work *delayed_work = to_delayed_work(work); struct cfg80211_registered_device *rdev; rdev = container_of(delayed_work, struct cfg80211_registered_device, background_cac_done_wk); cfg80211_background_cac_event(rdev, &rdev->background_radar_chandef, NL80211_RADAR_CAC_FINISHED); } void cfg80211_background_cac_abort_wk(struct work_struct *work) { struct cfg80211_registered_device *rdev; rdev = container_of(work, struct cfg80211_registered_device, background_cac_abort_wk); cfg80211_background_cac_event(rdev, &rdev->background_radar_chandef, NL80211_RADAR_CAC_ABORTED); } void cfg80211_background_cac_abort(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); queue_work(cfg80211_wq, &rdev->background_cac_abort_wk); } EXPORT_SYMBOL(cfg80211_background_cac_abort); int cfg80211_start_background_radar_detection(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_chan_def *chandef) { unsigned int cac_time_ms; int err; lockdep_assert_wiphy(&rdev->wiphy); if (!wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_RADAR_BACKGROUND)) return -EOPNOTSUPP; /* Offchannel chain already locked by another wdev */ if (rdev->background_radar_wdev && rdev->background_radar_wdev != wdev) return -EBUSY; /* CAC already in progress on the offchannel chain */ if (rdev->background_radar_wdev == wdev && delayed_work_pending(&rdev->background_cac_done_wk)) return -EBUSY; err = rdev_set_radar_background(rdev, chandef); if (err) return err; cac_time_ms = cfg80211_chandef_dfs_cac_time(&rdev->wiphy, chandef); if (!cac_time_ms) cac_time_ms = IEEE80211_DFS_MIN_CAC_TIME_MS; rdev->background_radar_chandef = *chandef; rdev->background_radar_wdev = wdev; /* Get offchain ownership */ __cfg80211_background_cac_event(rdev, wdev, chandef, NL80211_RADAR_CAC_STARTED); queue_delayed_work(cfg80211_wq, &rdev->background_cac_done_wk, msecs_to_jiffies(cac_time_ms)); return 0; } void cfg80211_stop_background_radar_detection(struct wireless_dev *wdev) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); lockdep_assert_wiphy(wiphy); if (wdev != rdev->background_radar_wdev) return; rdev_set_radar_background(rdev, NULL); rdev->background_radar_wdev = NULL; /* Release offchain ownership */ __cfg80211_background_cac_event(rdev, wdev, &rdev->background_radar_chandef, NL80211_RADAR_CAC_ABORTED); } |
9 7 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CONTEXT_TRACKING_H #define _LINUX_CONTEXT_TRACKING_H #include <linux/sched.h> #include <linux/vtime.h> #include <linux/context_tracking_state.h> #include <linux/instrumentation.h> #include <asm/ptrace.h> #ifdef CONFIG_CONTEXT_TRACKING_USER extern void ct_cpu_track_user(int cpu); /* Called with interrupts disabled. */ extern void __ct_user_enter(enum ctx_state state); extern void __ct_user_exit(enum ctx_state state); extern void ct_user_enter(enum ctx_state state); extern void ct_user_exit(enum ctx_state state); extern void user_enter_callable(void); extern void user_exit_callable(void); static inline void user_enter(void) { if (context_tracking_enabled()) ct_user_enter(CT_STATE_USER); } static inline void user_exit(void) { if (context_tracking_enabled()) ct_user_exit(CT_STATE_USER); } /* Called with interrupts disabled. */ static __always_inline void user_enter_irqoff(void) { if (context_tracking_enabled()) __ct_user_enter(CT_STATE_USER); } static __always_inline void user_exit_irqoff(void) { if (context_tracking_enabled()) __ct_user_exit(CT_STATE_USER); } static inline enum ctx_state exception_enter(void) { enum ctx_state prev_ctx; if (IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK) || !context_tracking_enabled()) return 0; prev_ctx = __ct_state(); if (prev_ctx != CT_STATE_KERNEL) ct_user_exit(prev_ctx); return prev_ctx; } static inline void exception_exit(enum ctx_state prev_ctx) { if (!IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK) && context_tracking_enabled()) { if (prev_ctx != CT_STATE_KERNEL) ct_user_enter(prev_ctx); } } static __always_inline bool context_tracking_guest_enter(void) { if (context_tracking_enabled()) __ct_user_enter(CT_STATE_GUEST); return context_tracking_enabled_this_cpu(); } static __always_inline bool context_tracking_guest_exit(void) { if (context_tracking_enabled()) __ct_user_exit(CT_STATE_GUEST); return context_tracking_enabled_this_cpu(); } #define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond)) #else static inline void user_enter(void) { } static inline void user_exit(void) { } static inline void user_enter_irqoff(void) { } static inline void user_exit_irqoff(void) { } static inline int exception_enter(void) { return 0; } static inline void exception_exit(enum ctx_state prev_ctx) { } static inline int ct_state(void) { return -1; } static inline int __ct_state(void) { return -1; } static __always_inline bool context_tracking_guest_enter(void) { return false; } static __always_inline bool context_tracking_guest_exit(void) { return false; } #define CT_WARN_ON(cond) do { } while (0) #endif /* !CONFIG_CONTEXT_TRACKING_USER */ #ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE extern void context_tracking_init(void); #else static inline void context_tracking_init(void) { } #endif /* CONFIG_CONTEXT_TRACKING_USER_FORCE */ #ifdef CONFIG_CONTEXT_TRACKING_IDLE extern void ct_idle_enter(void); extern void ct_idle_exit(void); /* * Is RCU watching the current CPU (IOW, it is not in an extended quiescent state)? * * Note that this returns the actual boolean data (watching / not watching), * whereas ct_rcu_watching() returns the RCU_WATCHING subvariable of * context_tracking.state. * * No ordering, as we are sampling CPU-local information. */ static __always_inline bool rcu_is_watching_curr_cpu(void) { return raw_atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING; } /* * Increment the current CPU's context_tracking structure's ->state field * with ordering. Return the new value. */ static __always_inline unsigned long ct_state_inc(int incby) { return raw_atomic_add_return(incby, this_cpu_ptr(&context_tracking.state)); } static __always_inline bool warn_rcu_enter(void) { bool ret = false; /* * Horrible hack to shut up recursive RCU isn't watching fail since * lots of the actual reporting also relies on RCU. */ preempt_disable_notrace(); if (!rcu_is_watching_curr_cpu()) { ret = true; ct_state_inc(CT_RCU_WATCHING); } return ret; } static __always_inline void warn_rcu_exit(bool rcu) { if (rcu) ct_state_inc(CT_RCU_WATCHING); preempt_enable_notrace(); } #else static inline void ct_idle_enter(void) { } static inline void ct_idle_exit(void) { } static __always_inline bool warn_rcu_enter(void) { return false; } static __always_inline void warn_rcu_exit(bool rcu) { } #endif /* !CONFIG_CONTEXT_TRACKING_IDLE */ #endif |
7 7 7 6 6 6 9 9 10 7 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2018-2020 Christoph Hellwig. * * DMA operations that map physical memory directly without using an IOMMU. */ #include <linux/memblock.h> /* for max_pfn */ #include <linux/export.h> #include <linux/mm.h> #include <linux/dma-map-ops.h> #include <linux/scatterlist.h> #include <linux/pfn.h> #include <linux/vmalloc.h> #include <linux/set_memory.h> #include <linux/slab.h> #include "direct.h" /* * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use * it for entirely different regions. In that case the arch code needs to * override the variable below for dma-direct to work properly. */ u64 zone_dma_limit __ro_after_init = DMA_BIT_MASK(24); static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) { if (force_dma_unencrypted(dev)) return phys_to_dma_unencrypted(dev, phys); return phys_to_dma(dev, phys); } static inline struct page *dma_direct_to_page(struct device *dev, dma_addr_t dma_addr) { return pfn_to_page(PHYS_PFN(dma_to_phys(dev, dma_addr))); } u64 dma_direct_get_required_mask(struct device *dev) { phys_addr_t phys = (phys_addr_t)(max_pfn - 1) << PAGE_SHIFT; u64 max_dma = phys_to_dma_direct(dev, phys); return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } static gfp_t dma_direct_optimal_gfp_mask(struct device *dev, u64 *phys_limit) { u64 dma_limit = min_not_zero( dev->coherent_dma_mask, dev->bus_dma_limit); /* * Optimistically try the zone that the physical address mask falls * into first. If that returns memory that isn't actually addressable * we will fallback to the next lower zone and try again. * * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding * zones. */ *phys_limit = dma_to_phys(dev, dma_limit); if (*phys_limit <= zone_dma_limit) return GFP_DMA; if (*phys_limit <= DMA_BIT_MASK(32)) return GFP_DMA32; return 0; } bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { dma_addr_t dma_addr = phys_to_dma_direct(dev, phys); if (dma_addr == DMA_MAPPING_ERROR) return false; return dma_addr + size - 1 <= min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size) { if (!force_dma_unencrypted(dev)) return 0; return set_memory_decrypted((unsigned long)vaddr, PFN_UP(size)); } static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size) { int ret; if (!force_dma_unencrypted(dev)) return 0; ret = set_memory_encrypted((unsigned long)vaddr, PFN_UP(size)); if (ret) pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n"); return ret; } static void __dma_direct_free_pages(struct device *dev, struct page *page, size_t size) { if (swiotlb_free(dev, page, size)) return; dma_free_contiguous(dev, page, size); } static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size) { struct page *page = swiotlb_alloc(dev, size); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { swiotlb_free(dev, page, size); return NULL; } return page; } static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp, bool allow_highmem) { int node = dev_to_node(dev); struct page *page = NULL; u64 phys_limit; WARN_ON_ONCE(!PAGE_ALIGNED(size)); if (is_swiotlb_for_alloc(dev)) return dma_direct_alloc_swiotlb(dev, size); gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit); page = dma_alloc_contiguous(dev, size, gfp); if (page) { if (!dma_coherent_ok(dev, page_to_phys(page), size) || (!allow_highmem && PageHighMem(page))) { dma_free_contiguous(dev, page, size); page = NULL; } } again: if (!page) page = alloc_pages_node(node, gfp, get_order(size)); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { __free_pages(page, get_order(size)); page = NULL; if (IS_ENABLED(CONFIG_ZONE_DMA32) && phys_limit < DMA_BIT_MASK(64) && !(gfp & (GFP_DMA32 | GFP_DMA))) { gfp |= GFP_DMA32; goto again; } if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } } return page; } /* * Check if a potentially blocking operations needs to dip into the atomic * pools for the given device/gfp. */ static bool dma_direct_use_pool(struct device *dev, gfp_t gfp) { return !gfpflags_allow_blocking(gfp) && !is_swiotlb_for_alloc(dev); } static void *dma_direct_alloc_from_pool(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { struct page *page; u64 phys_limit; void *ret; if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))) return NULL; gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit); page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok); if (!page) return NULL; *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return ret; } static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { struct page *page; page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true); if (!page) return NULL; /* remove any dirty cache lines on the kernel alias */ if (!PageHighMem(page)) arch_dma_prep_coherent(page, size); /* return the page pointer as the opaque cookie */ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return page; } void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { bool remap = false, set_uncached = false; struct page *page; void *ret; size = PAGE_ALIGN(size); if (attrs & DMA_ATTR_NO_WARN) gfp |= __GFP_NOWARN; if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp); if (!dev_is_dma_coherent(dev)) { if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_ALLOC) && !is_swiotlb_for_alloc(dev)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); /* * If there is a global pool, always allocate from it for * non-coherent devices. */ if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL)) return dma_alloc_from_global_coherent(dev, size, dma_handle); /* * Otherwise we require the architecture to either be able to * mark arbitrary parts of the kernel direct mapping uncached, * or remapped it uncached. */ set_uncached = IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED); remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP); if (!set_uncached && !remap) { pr_warn_once("coherent DMA allocations not supported on this platform.\n"); return NULL; } } /* * Remapping or decrypting memory may block, allocate the memory from * the atomic pools instead if we aren't allowed block. */ if ((remap || force_dma_unencrypted(dev)) && dma_direct_use_pool(dev, gfp)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); /* we always manually zero the memory once we are done */ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true); if (!page) return NULL; /* * dma_alloc_contiguous can return highmem pages depending on a * combination the cma= arguments and per-arch setup. These need to be * remapped to return a kernel virtual address. */ if (PageHighMem(page)) { remap = true; set_uncached = false; } if (remap) { pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs); if (force_dma_unencrypted(dev)) prot = pgprot_decrypted(prot); /* remove any dirty cache lines on the kernel alias */ arch_dma_prep_coherent(page, size); /* create a coherent mapping */ ret = dma_common_contiguous_remap(page, size, prot, __builtin_return_address(0)); if (!ret) goto out_free_pages; } else { ret = page_address(page); if (dma_set_decrypted(dev, ret, size)) goto out_leak_pages; } memset(ret, 0, size); if (set_uncached) { arch_dma_prep_coherent(page, size); ret = arch_dma_set_uncached(ret, size); if (IS_ERR(ret)) goto out_encrypt_pages; } *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return ret; out_encrypt_pages: if (dma_set_encrypted(dev, page_address(page), size)) return NULL; out_free_pages: __dma_direct_free_pages(dev, page, size); return NULL; out_leak_pages: return NULL; } void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { unsigned int page_order = get_order(size); if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ dma_free_contiguous(dev, cpu_addr, size); return; } if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_ALLOC) && !dev_is_dma_coherent(dev) && !is_swiotlb_for_alloc(dev)) { arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); return; } if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && !dev_is_dma_coherent(dev)) { if (!dma_release_from_global_coherent(page_order, cpu_addr)) WARN_ON_ONCE(1); return; } /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) return; if (is_vmalloc_addr(cpu_addr)) { vunmap(cpu_addr); } else { if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) arch_dma_clear_uncached(cpu_addr, size); if (dma_set_encrypted(dev, cpu_addr, size)) return; } __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size); } struct page *dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { struct page *page; void *ret; if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); page = __dma_direct_alloc_pages(dev, size, gfp, false); if (!page) return NULL; ret = page_address(page); if (dma_set_decrypted(dev, ret, size)) goto out_leak_pages; memset(ret, 0, size); *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return page; out_leak_pages: return NULL; } void dma_direct_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_addr, enum dma_data_direction dir) { void *vaddr = page_address(page); /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && dma_free_from_pool(dev, vaddr, size)) return; if (dma_set_encrypted(dev, vaddr, size)) return; __dma_direct_free_pages(dev, page, size); } #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_SWIOTLB) void dma_direct_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { struct scatterlist *sg; int i; for_each_sg(sgl, sg, nents, i) { phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); swiotlb_sync_single_for_device(dev, paddr, sg->length, dir); if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(paddr, sg->length, dir); } } #endif #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ defined(CONFIG_SWIOTLB) void dma_direct_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir) { struct scatterlist *sg; int i; for_each_sg(sgl, sg, nents, i) { phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu(paddr, sg->length, dir); swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir); if (dir == DMA_FROM_DEVICE) arch_dma_mark_clean(paddr, sg->length); } if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu_all(); } /* * Unmaps segments, except for ones marked as pci_p2pdma which do not * require any further action as they contain a bus address. */ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *sg; int i; for_each_sg(sgl, sg, nents, i) { if (sg_dma_is_bus_address(sg)) sg_dma_unmark_bus_address(sg); else dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir, attrs); } } #endif int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction dir, unsigned long attrs) { struct pci_p2pdma_map_state p2pdma_state = {}; enum pci_p2pdma_map_type map; struct scatterlist *sg; int i, ret; for_each_sg(sgl, sg, nents, i) { if (is_pci_p2pdma_page(sg_page(sg))) { map = pci_p2pdma_map_segment(&p2pdma_state, dev, sg); switch (map) { case PCI_P2PDMA_MAP_BUS_ADDR: continue; case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: /* * Any P2P mapping that traverses the PCI * host bridge must be mapped with CPU physical * address and not PCI bus addresses. This is * done with dma_direct_map_page() below. */ break; default: ret = -EREMOTEIO; goto out_unmap; } } sg->dma_address = dma_direct_map_page(dev, sg_page(sg), sg->offset, sg->length, dir, attrs); if (sg->dma_address == DMA_MAPPING_ERROR) { ret = -EIO; goto out_unmap; } sg_dma_len(sg) = sg->length; } return nents; out_unmap: dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return ret; } dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, size_t size, enum dma_data_direction dir, unsigned long attrs) { dma_addr_t dma_addr = paddr; if (unlikely(!dma_capable(dev, dma_addr, size, false))) { dev_err_once(dev, "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); WARN_ON_ONCE(1); return DMA_MAPPING_ERROR; } return dma_addr; } int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { struct page *page = dma_direct_to_page(dev, dma_addr); int ret; ret = sg_alloc_table(sgt, 1, GFP_KERNEL); if (!ret) sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); return ret; } bool dma_direct_can_mmap(struct device *dev) { return dev_is_dma_coherent(dev) || IS_ENABLED(CONFIG_DMA_NONCOHERENT_MMAP); } int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { unsigned long user_count = vma_pages(vma); unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long pfn = PHYS_PFN(dma_to_phys(dev, dma_addr)); int ret = -ENXIO; vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); if (force_dma_unencrypted(dev)) vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; if (dma_mmap_from_global_coherent(vma, cpu_addr, size, &ret)) return ret; if (vma->vm_pgoff >= count || user_count > count - vma->vm_pgoff) return -ENXIO; return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, user_count << PAGE_SHIFT, vma->vm_page_prot); } int dma_direct_supported(struct device *dev, u64 mask) { u64 min_mask = (max_pfn - 1) << PAGE_SHIFT; /* * Because 32-bit DMA masks are so common we expect every architecture * to be able to satisfy them - either by not supporting more physical * memory, or by providing a ZONE_DMA32. If neither is the case, the * architecture needs to use an IOMMU instead of the direct mapping. */ if (mask >= DMA_BIT_MASK(32)) return 1; /* * This check needs to be against the actual bit mask value, so use * phys_to_dma_unencrypted() here so that the SME encryption mask isn't * part of the check. */ if (IS_ENABLED(CONFIG_ZONE_DMA)) min_mask = min_t(u64, min_mask, zone_dma_limit); return mask >= phys_to_dma_unencrypted(dev, min_mask); } /* * To check whether all ram resource ranges are covered by dma range map * Returns 0 when further check is needed * Returns 1 if there is some RAM range can't be covered by dma_range_map */ static int check_ram_in_range_map(unsigned long start_pfn, unsigned long nr_pages, void *data) { unsigned long end_pfn = start_pfn + nr_pages; const struct bus_dma_region *bdr = NULL; const struct bus_dma_region *m; struct device *dev = data; while (start_pfn < end_pfn) { for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) { unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start); if (start_pfn >= cpu_start_pfn && start_pfn - cpu_start_pfn < PFN_DOWN(m->size)) { bdr = m; break; } } if (!bdr) return 1; start_pfn = PFN_DOWN(bdr->cpu_start) + PFN_DOWN(bdr->size); } return 0; } bool dma_direct_all_ram_mapped(struct device *dev) { if (!dev->dma_range_map) return true; return !walk_system_ram_range(0, PFN_DOWN(ULONG_MAX) + 1, dev, check_ram_in_range_map); } size_t dma_direct_max_mapping_size(struct device *dev) { /* If SWIOTLB is active, use its maximum mapping size */ if (is_swiotlb_active(dev) && (dma_addressing_limited(dev) || is_swiotlb_force_bounce(dev))) return swiotlb_max_mapping_size(dev); return SIZE_MAX; } bool dma_direct_need_sync(struct device *dev, dma_addr_t dma_addr) { return !dev_is_dma_coherent(dev) || swiotlb_find_pool(dev, dma_to_phys(dev, dma_addr)); } /** * dma_direct_set_offset - Assign scalar offset for a single DMA range. * @dev: device pointer; needed to "own" the alloced memory. * @cpu_start: beginning of memory region covered by this offset. * @dma_start: beginning of DMA/PCI region covered by this offset. * @size: size of the region. * * This is for the simple case of a uniform offset which cannot * be discovered by "dma-ranges". * * It returns -ENOMEM if out of memory, -EINVAL if a map * already exists, 0 otherwise. * * Note: any call to this from a driver is a bug. The mapping needs * to be described by the device tree or other firmware interfaces. */ int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start, dma_addr_t dma_start, u64 size) { struct bus_dma_region *map; u64 offset = (u64)cpu_start - (u64)dma_start; if (dev->dma_range_map) { dev_err(dev, "attempt to add DMA range to existing map\n"); return -EINVAL; } if (!offset) return 0; map = kcalloc(2, sizeof(*map), GFP_KERNEL); if (!map) return -ENOMEM; map[0].cpu_start = cpu_start; map[0].dma_start = dma_start; map[0].size = size; dev->dma_range_map = map; return 0; } |
8 8 8 8 8 8 3 5 10 1 1 5 3 2 3 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch> */ #include <linux/filter.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/types.h> #include <linux/bpf.h> #include <net/lwtunnel.h> #include <net/gre.h> #include <net/ip.h> #include <net/ip6_route.h> #include <net/ipv6_stubs.h> #include <net/inet_dscp.h> struct bpf_lwt_prog { struct bpf_prog *prog; char *name; }; struct bpf_lwt { struct bpf_lwt_prog in; struct bpf_lwt_prog out; struct bpf_lwt_prog xmit; int family; }; #define MAX_PROG_NAME 256 static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) { return (struct bpf_lwt *)lwt->data; } #define NO_REDIRECT false #define CAN_REDIRECT true static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, struct dst_entry *dst, bool can_redirect) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int ret; /* Disabling BH is needed to protect per-CPU bpf_redirect_info between * BPF prog and skb_do_redirect(). */ local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); bpf_compute_data_pointers(skb); ret = bpf_prog_run_save_cb(lwt->prog, skb); switch (ret) { case BPF_OK: case BPF_LWT_REROUTE: break; case BPF_REDIRECT: if (unlikely(!can_redirect)) { pr_warn_once("Illegal redirect return code in prog %s\n", lwt->name ? : "<unknown>"); ret = BPF_OK; } else { skb_reset_mac_header(skb); skb_do_redirect(skb); ret = BPF_REDIRECT; } break; case BPF_DROP: kfree_skb(skb); ret = -EPERM; break; default: pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret); kfree_skb(skb); ret = -EINVAL; break; } bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); return ret; } static int bpf_lwt_input_reroute(struct sk_buff *skb) { enum skb_drop_reason reason; int err = -EINVAL; if (skb->protocol == htons(ETH_P_IP)) { struct net_device *dev = skb_dst(skb)->dev; const struct iphdr *iph = ip_hdr(skb); dev_hold(dev); skb_dst_drop(skb); reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev); err = reason ? -EINVAL : 0; dev_put(dev); } else if (skb->protocol == htons(ETH_P_IPV6)) { skb_dst_drop(skb); err = ipv6_stub->ipv6_route_input(skb); } else { err = -EAFNOSUPPORT; } if (err) goto err; return dst_input(skb); err: kfree_skb(skb); return err; } static int bpf_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct bpf_lwt *bpf; int ret; bpf = bpf_lwt_lwtunnel(dst->lwtstate); if (bpf->in.prog) { ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); if (ret < 0) return ret; if (ret == BPF_LWT_REROUTE) return bpf_lwt_input_reroute(skb); } if (unlikely(!dst->lwtstate->orig_input)) { kfree_skb(skb); return -EINVAL; } return dst->lwtstate->orig_input(skb); } static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct bpf_lwt *bpf; int ret; bpf = bpf_lwt_lwtunnel(dst->lwtstate); if (bpf->out.prog) { ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT); if (ret < 0) return ret; } if (unlikely(!dst->lwtstate->orig_output)) { pr_warn_once("orig_output not set on dst for prog %s\n", bpf->out.name); kfree_skb(skb); return -EINVAL; } return dst->lwtstate->orig_output(net, sk, skb); } static int xmit_check_hhlen(struct sk_buff *skb, int hh_len) { if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC)) return -ENOMEM; } return 0; } static int bpf_lwt_xmit_reroute(struct sk_buff *skb) { struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev); int oif = l3mdev ? l3mdev->ifindex : 0; struct dst_entry *dst = NULL; int err = -EAFNOSUPPORT; struct sock *sk; struct net *net; bool ipv4; if (skb->protocol == htons(ETH_P_IP)) ipv4 = true; else if (skb->protocol == htons(ETH_P_IPV6)) ipv4 = false; else goto err; sk = sk_to_full_sk(skb->sk); if (sk) { if (sk->sk_bound_dev_if) oif = sk->sk_bound_dev_if; net = sock_net(sk); } else { net = dev_net(skb_dst(skb)->dev); } if (ipv4) { struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4 = {}; struct rtable *rt; fl4.flowi4_oif = oif; fl4.flowi4_mark = skb->mark; fl4.flowi4_uid = sock_net_uid(net, sk); fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = iph->protocol; fl4.daddr = iph->daddr; fl4.saddr = iph->saddr; rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto err; } dst = &rt->dst; } else { struct ipv6hdr *iph6 = ipv6_hdr(skb); struct flowi6 fl6 = {}; fl6.flowi6_oif = oif; fl6.flowi6_mark = skb->mark; fl6.flowi6_uid = sock_net_uid(net, sk); fl6.flowlabel = ip6_flowinfo(iph6); fl6.flowi6_proto = iph6->nexthdr; fl6.daddr = iph6->daddr; fl6.saddr = iph6->saddr; dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto err; } } if (unlikely(dst->error)) { err = dst->error; dst_release(dst); goto err; } /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it * was done for the previous dst, so we are doing it here again, in * case the new dst needs much more space. The call below is a noop * if there is enough header space in skb. */ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) goto err; skb_dst_drop(skb); skb_dst_set(skb, dst); err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(err)) return net_xmit_errno(err); /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ return LWTUNNEL_XMIT_DONE; err: kfree_skb(skb); return err; } static int bpf_xmit(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct bpf_lwt *bpf; bpf = bpf_lwt_lwtunnel(dst->lwtstate); if (bpf->xmit.prog) { int hh_len = dst->dev->hard_header_len; __be16 proto = skb->protocol; int ret; ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT); switch (ret) { case BPF_OK: /* If the header changed, e.g. via bpf_lwt_push_encap, * BPF_LWT_REROUTE below should have been used if the * protocol was also changed. */ if (skb->protocol != proto) { kfree_skb(skb); return -EINVAL; } /* If the header was expanded, headroom might be too * small for L2 header to come, expand as needed. */ ret = xmit_check_hhlen(skb, hh_len); if (unlikely(ret)) return ret; return LWTUNNEL_XMIT_CONTINUE; case BPF_REDIRECT: return LWTUNNEL_XMIT_DONE; case BPF_LWT_REROUTE: return bpf_lwt_xmit_reroute(skb); default: return ret; } } return LWTUNNEL_XMIT_CONTINUE; } static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog) { if (prog->prog) bpf_prog_put(prog->prog); kfree(prog->name); } static void bpf_destroy_state(struct lwtunnel_state *lwt) { struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); bpf_lwt_prog_destroy(&bpf->in); bpf_lwt_prog_destroy(&bpf->out); bpf_lwt_prog_destroy(&bpf->xmit); } static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, .len = MAX_PROG_NAME }, }; static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, enum bpf_prog_type type) { struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; struct bpf_prog *p; int ret; u32 fd; ret = nla_parse_nested_deprecated(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy, NULL); if (ret < 0) return ret; if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) return -EINVAL; prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC); if (!prog->name) return -ENOMEM; fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); p = bpf_prog_get_type(fd, type); if (IS_ERR(p)) return PTR_ERR(p); prog->prog = p; return 0; } static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { [LWT_BPF_IN] = { .type = NLA_NESTED, }, [LWT_BPF_OUT] = { .type = NLA_NESTED, }, [LWT_BPF_XMIT] = { .type = NLA_NESTED, }, [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, }; static int bpf_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[LWT_BPF_MAX + 1]; struct lwtunnel_state *newts; struct bpf_lwt *bpf; int ret; if (family != AF_INET && family != AF_INET6) return -EAFNOSUPPORT; ret = nla_parse_nested_deprecated(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack); if (ret < 0) return ret; if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT]) return -EINVAL; newts = lwtunnel_state_alloc(sizeof(*bpf)); if (!newts) return -ENOMEM; newts->type = LWTUNNEL_ENCAP_BPF; bpf = bpf_lwt_lwtunnel(newts); if (tb[LWT_BPF_IN]) { newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in, BPF_PROG_TYPE_LWT_IN); if (ret < 0) goto errout; } if (tb[LWT_BPF_OUT]) { newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out, BPF_PROG_TYPE_LWT_OUT); if (ret < 0) goto errout; } if (tb[LWT_BPF_XMIT]) { newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit, BPF_PROG_TYPE_LWT_XMIT); if (ret < 0) goto errout; } if (tb[LWT_BPF_XMIT_HEADROOM]) { u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]); if (headroom > LWT_BPF_MAX_HEADROOM) { ret = -ERANGE; goto errout; } newts->headroom = headroom; } bpf->family = family; *ts = newts; return 0; errout: bpf_destroy_state(newts); kfree(newts); return ret; } static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr, struct bpf_lwt_prog *prog) { struct nlattr *nest; if (!prog->prog) return 0; nest = nla_nest_start_noflag(skb, attr); if (!nest) return -EMSGSIZE; if (prog->name && nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name)) return -EMSGSIZE; return nla_nest_end(skb, nest); } static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) { struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 || bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 || bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0) return -EMSGSIZE; return 0; } static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) { int nest_len = nla_total_size(sizeof(struct nlattr)) + nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */ 0; return nest_len + /* LWT_BPF_IN */ nest_len + /* LWT_BPF_OUT */ nest_len + /* LWT_BPF_XMIT */ 0; } static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) { /* FIXME: * The LWT state is currently rebuilt for delete requests which * results in a new bpf_prog instance. Comparing names for now. */ if (!a->name && !b->name) return 0; if (!a->name || !b->name) return 1; return strcmp(a->name, b->name); } static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a); struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b); return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) || bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) || bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit); } static const struct lwtunnel_encap_ops bpf_encap_ops = { .build_state = bpf_build_state, .destroy_state = bpf_destroy_state, .input = bpf_input, .output = bpf_output, .xmit = bpf_xmit, .fill_encap = bpf_fill_encap_info, .get_encap_size = bpf_encap_nlsize, .cmp_encap = bpf_encap_cmp, .owner = THIS_MODULE, }; static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type, int encap_len) { struct skb_shared_info *shinfo = skb_shinfo(skb); gso_type |= SKB_GSO_DODGY; shinfo->gso_type |= gso_type; skb_decrease_gso_size(shinfo, encap_len); shinfo->gso_segs = 0; return 0; } static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len) { int next_hdr_offset; void *next_hdr; __u8 protocol; /* SCTP and UDP_L4 gso need more nuanced handling than what * handle_gso_type() does above: skb_decrease_gso_size() is not enough. * So at the moment only TCP GSO packets are let through. */ if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) return -ENOTSUPP; if (ipv4) { protocol = ip_hdr(skb)->protocol; next_hdr_offset = sizeof(struct iphdr); next_hdr = skb_network_header(skb) + next_hdr_offset; } else { protocol = ipv6_hdr(skb)->nexthdr; next_hdr_offset = sizeof(struct ipv6hdr); next_hdr = skb_network_header(skb) + next_hdr_offset; } switch (protocol) { case IPPROTO_GRE: next_hdr_offset += sizeof(struct gre_base_hdr); if (next_hdr_offset > encap_len) return -EINVAL; if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM) return handle_gso_type(skb, SKB_GSO_GRE_CSUM, encap_len); return handle_gso_type(skb, SKB_GSO_GRE, encap_len); case IPPROTO_UDP: next_hdr_offset += sizeof(struct udphdr); if (next_hdr_offset > encap_len) return -EINVAL; if (((struct udphdr *)next_hdr)->check) return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM, encap_len); return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len); case IPPROTO_IP: case IPPROTO_IPV6: if (ipv4) return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len); else return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len); default: return -EPROTONOSUPPORT; } } int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) { struct iphdr *iph; bool ipv4; int err; if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM)) return -EINVAL; /* validate protocol and length */ iph = (struct iphdr *)hdr; if (iph->version == 4) { ipv4 = true; if (unlikely(len < iph->ihl * 4)) return -EINVAL; } else if (iph->version == 6) { ipv4 = false; if (unlikely(len < sizeof(struct ipv6hdr))) return -EINVAL; } else { return -EINVAL; } if (ingress) err = skb_cow_head(skb, len + skb->mac_len); else err = skb_cow_head(skb, len + LL_RESERVED_SPACE(skb_dst(skb)->dev)); if (unlikely(err)) return err; /* push the encap headers and fix pointers */ skb_reset_inner_headers(skb); skb_reset_inner_mac_header(skb); /* mac header is not yet set */ skb_set_inner_protocol(skb, skb->protocol); skb->encapsulation = 1; skb_push(skb, len); if (ingress) skb_postpush_rcsum(skb, iph, len); skb_reset_network_header(skb); memcpy(skb_network_header(skb), hdr, len); bpf_compute_data_pointers(skb); skb_clear_hash(skb); if (ipv4) { skb->protocol = htons(ETH_P_IP); iph = ip_hdr(skb); if (!iph->check) iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } else { skb->protocol = htons(ETH_P_IPV6); } if (skb_is_gso(skb)) return handle_gso_encap(skb, ipv4, len); return 0; } static int __init bpf_lwt_init(void) { return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); } subsys_initcall(bpf_lwt_init) |
8 8 8 17 17 26 1 1 1 23 2 2 2 2 1 12 6 17 2 16 1 8 6 2 8 8 8 1737 1740 1743 406 168 2 12 12 17 6 12 17 17 17 4 9 2 2 4 9 2 2 52 52 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_mirred.c packet mirroring and redirect actions * * Authors: Jamal Hadi Salim (2002-4) * * TODO: Add ingress support (and socket redirect support) */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <linux/gfp.h> #include <linux/if_arp.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/dst.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <linux/tc_act/tc_mirred.h> #include <net/tc_act/tc_mirred.h> #include <net/tc_wrapper.h> static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); #define MIRRED_NEST_LIMIT 4 static DEFINE_PER_CPU(unsigned int, mirred_nest_level); static bool tcf_mirred_is_act_redirect(int action) { return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR; } static bool tcf_mirred_act_wants_ingress(int action) { switch (action) { case TCA_EGRESS_REDIR: case TCA_EGRESS_MIRROR: return false; case TCA_INGRESS_REDIR: case TCA_INGRESS_MIRROR: return true; default: BUG(); } } static bool tcf_mirred_can_reinsert(int action) { switch (action) { case TC_ACT_SHOT: case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: return true; } return false; } static struct net_device *tcf_mirred_dev_dereference(struct tcf_mirred *m) { return rcu_dereference_protected(m->tcfm_dev, lockdep_is_held(&m->tcf_lock)); } static void tcf_mirred_release(struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); struct net_device *dev; spin_lock(&mirred_list_lock); list_del(&m->tcfm_list); spin_unlock(&mirred_list_lock); /* last reference to action, no need to lock */ dev = rcu_dereference_protected(m->tcfm_dev, 1); netdev_put(dev, &m->tcfm_dev_tracker); } static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, [TCA_MIRRED_BLOCKID] = NLA_POLICY_MIN(NLA_U32, 1), }; static struct tc_action_ops act_mirred_ops; static void tcf_mirred_replace_dev(struct tcf_mirred *m, struct net_device *ndev) { struct net_device *odev; odev = rcu_replace_pointer(m->tcfm_dev, ndev, lockdep_is_held(&m->tcf_lock)); netdev_put(odev, &m->tcfm_dev_tracker); } static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_mirred_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_MIRRED_MAX + 1]; struct tcf_chain *goto_ch = NULL; bool mac_header_xmit = false; struct tc_mirred *parm; struct tcf_mirred *m; bool exists = false; int ret, err; u32 index; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed"); return -EINVAL; } ret = nla_parse_nested_deprecated(tb, TCA_MIRRED_MAX, nla, mirred_policy, extack); if (ret < 0) return ret; if (!tb[TCA_MIRRED_PARMS]) { NL_SET_ERR_MSG_MOD(extack, "Missing required mirred parameters"); return -EINVAL; } parm = nla_data(tb[TCA_MIRRED_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return ACT_P_BOUND; if (tb[TCA_MIRRED_BLOCKID] && parm->ifindex) { NL_SET_ERR_MSG_MOD(extack, "Cannot specify Block ID and dev simultaneously"); if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } switch (parm->eaction) { case TCA_EGRESS_MIRROR: case TCA_EGRESS_REDIR: case TCA_INGRESS_REDIR: case TCA_INGRESS_MIRROR: break; default: if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option"); return -EINVAL; } if (!exists) { if (!parm->ifindex && !tb[TCA_MIRRED_BLOCKID]) { tcf_idr_cleanup(tn, index); NL_SET_ERR_MSG_MOD(extack, "Must specify device or block"); return -EINVAL; } ret = tcf_idr_create_from_flags(tn, index, est, a, &act_mirred_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } m = to_mirred(*a); if (ret == ACT_P_CREATED) INIT_LIST_HEAD(&m->tcfm_list); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; spin_lock_bh(&m->tcf_lock); if (parm->ifindex) { struct net_device *ndev; ndev = dev_get_by_index(net, parm->ifindex); if (!ndev) { spin_unlock_bh(&m->tcf_lock); err = -ENODEV; goto put_chain; } mac_header_xmit = dev_is_mac_header_xmit(ndev); tcf_mirred_replace_dev(m, ndev); netdev_tracker_alloc(ndev, &m->tcfm_dev_tracker, GFP_ATOMIC); m->tcfm_mac_header_xmit = mac_header_xmit; m->tcfm_blockid = 0; } else if (tb[TCA_MIRRED_BLOCKID]) { tcf_mirred_replace_dev(m, NULL); m->tcfm_mac_header_xmit = false; m->tcfm_blockid = nla_get_u32(tb[TCA_MIRRED_BLOCKID]); } goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); m->tcfm_eaction = parm->eaction; spin_unlock_bh(&m->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (ret == ACT_P_CREATED) { spin_lock(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); spin_unlock(&mirred_list_lock); } return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static int tcf_mirred_forward(bool at_ingress, bool want_ingress, struct sk_buff *skb) { int err; if (!want_ingress) err = tcf_dev_queue_xmit(skb, dev_queue_xmit); else if (!at_ingress) err = netif_rx(skb); else err = netif_receive_skb(skb); return err; } static int tcf_mirred_to_dev(struct sk_buff *skb, struct tcf_mirred *m, struct net_device *dev, const bool m_mac_header_xmit, int m_eaction, int retval) { struct sk_buff *skb_to_send = skb; bool want_ingress; bool is_redirect; bool expects_nh; bool at_ingress; bool dont_clone; int mac_len; bool at_nh; int err; is_redirect = tcf_mirred_is_act_redirect(m_eaction); if (unlikely(!(dev->flags & IFF_UP)) || !netif_carrier_ok(dev)) { net_notice_ratelimited("tc mirred to Houston: device %s is down\n", dev->name); goto err_cant_do; } /* we could easily avoid the clone only if called by ingress and clsact; * since we can't easily detect the clsact caller, skip clone only for * ingress - that covers the TC S/W datapath. */ at_ingress = skb_at_tc_ingress(skb); dont_clone = skb_at_tc_ingress(skb) && is_redirect && tcf_mirred_can_reinsert(retval); if (!dont_clone) { skb_to_send = skb_clone(skb, GFP_ATOMIC); if (!skb_to_send) goto err_cant_do; } want_ingress = tcf_mirred_act_wants_ingress(m_eaction); /* All mirred/redirected skbs should clear previous ct info */ nf_reset_ct(skb_to_send); if (want_ingress && !at_ingress) /* drop dst for egress -> ingress */ skb_dst_drop(skb_to_send); expects_nh = want_ingress || !m_mac_header_xmit; at_nh = skb->data == skb_network_header(skb); if (at_nh != expects_nh) { mac_len = at_ingress ? skb->mac_len : skb_network_offset(skb); if (expects_nh) { /* target device/action expect data at nh */ skb_pull_rcsum(skb_to_send, mac_len); } else { /* target device/action expect data at mac */ skb_push_rcsum(skb_to_send, mac_len); } } skb_to_send->skb_iif = skb->dev->ifindex; skb_to_send->dev = dev; if (is_redirect) { if (skb == skb_to_send) retval = TC_ACT_CONSUMED; skb_set_redirected(skb_to_send, skb_to_send->tc_at_ingress); err = tcf_mirred_forward(at_ingress, want_ingress, skb_to_send); } else { err = tcf_mirred_forward(at_ingress, want_ingress, skb_to_send); } if (err) tcf_action_inc_overlimit_qstats(&m->common); return retval; err_cant_do: if (is_redirect) retval = TC_ACT_SHOT; tcf_action_inc_overlimit_qstats(&m->common); return retval; } static int tcf_blockcast_redir(struct sk_buff *skb, struct tcf_mirred *m, struct tcf_block *block, int m_eaction, const u32 exception_ifindex, int retval) { struct net_device *dev_prev = NULL; struct net_device *dev = NULL; unsigned long index; int mirred_eaction; mirred_eaction = tcf_mirred_act_wants_ingress(m_eaction) ? TCA_INGRESS_MIRROR : TCA_EGRESS_MIRROR; xa_for_each(&block->ports, index, dev) { if (index == exception_ifindex) continue; if (!dev_prev) goto assign_prev; tcf_mirred_to_dev(skb, m, dev_prev, dev_is_mac_header_xmit(dev), mirred_eaction, retval); assign_prev: dev_prev = dev; } if (dev_prev) return tcf_mirred_to_dev(skb, m, dev_prev, dev_is_mac_header_xmit(dev_prev), m_eaction, retval); return retval; } static int tcf_blockcast_mirror(struct sk_buff *skb, struct tcf_mirred *m, struct tcf_block *block, int m_eaction, const u32 exception_ifindex, int retval) { struct net_device *dev = NULL; unsigned long index; xa_for_each(&block->ports, index, dev) { if (index == exception_ifindex) continue; tcf_mirred_to_dev(skb, m, dev, dev_is_mac_header_xmit(dev), m_eaction, retval); } return retval; } static int tcf_blockcast(struct sk_buff *skb, struct tcf_mirred *m, const u32 blockid, struct tcf_result *res, int retval) { const u32 exception_ifindex = skb->dev->ifindex; struct tcf_block *block; bool is_redirect; int m_eaction; m_eaction = READ_ONCE(m->tcfm_eaction); is_redirect = tcf_mirred_is_act_redirect(m_eaction); /* we are already under rcu protection, so can call block lookup * directly. */ block = tcf_block_lookup(dev_net(skb->dev), blockid); if (!block || xa_empty(&block->ports)) { tcf_action_inc_overlimit_qstats(&m->common); return retval; } if (is_redirect) return tcf_blockcast_redir(skb, m, block, m_eaction, exception_ifindex, retval); /* If it's not redirect, it is mirror */ return tcf_blockcast_mirror(skb, m, block, m_eaction, exception_ifindex, retval); } TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_mirred *m = to_mirred(a); int retval = READ_ONCE(m->tcf_action); unsigned int nest_level; bool m_mac_header_xmit; struct net_device *dev; int m_eaction; u32 blockid; nest_level = __this_cpu_inc_return(mirred_nest_level); if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); retval = TC_ACT_SHOT; goto dec_nest_level; } tcf_lastuse_update(&m->tcf_tm); tcf_action_update_bstats(&m->common, skb); blockid = READ_ONCE(m->tcfm_blockid); if (blockid) { retval = tcf_blockcast(skb, m, blockid, res, retval); goto dec_nest_level; } dev = rcu_dereference_bh(m->tcfm_dev); if (unlikely(!dev)) { pr_notice_once("tc mirred: target device is gone\n"); tcf_action_inc_overlimit_qstats(&m->common); goto dec_nest_level; } m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); m_eaction = READ_ONCE(m->tcfm_eaction); retval = tcf_mirred_to_dev(skb, m, dev, m_mac_header_xmit, m_eaction, retval); dec_nest_level: __this_cpu_dec(mirred_nest_level); return retval; } static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_mirred *m = to_mirred(a); struct tcf_t *tm = &m->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_mirred *m = to_mirred(a); struct tc_mirred opt = { .index = m->tcf_index, .refcnt = refcount_read(&m->tcf_refcnt) - ref, .bindcnt = atomic_read(&m->tcf_bindcnt) - bind, }; struct net_device *dev; struct tcf_t t; u32 blockid; spin_lock_bh(&m->tcf_lock); opt.action = m->tcf_action; opt.eaction = m->tcfm_eaction; dev = tcf_mirred_dev_dereference(m); if (dev) opt.ifindex = dev->ifindex; if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt)) goto nla_put_failure; blockid = m->tcfm_blockid; if (blockid && nla_put_u32(skb, TCA_MIRRED_BLOCKID, blockid)) goto nla_put_failure; tcf_tm_dump(&t, &m->tcf_tm); if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD)) goto nla_put_failure; spin_unlock_bh(&m->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&m->tcf_lock); nlmsg_trim(skb, b); return -1; } static int mirred_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tcf_mirred *m; ASSERT_RTNL(); if (event == NETDEV_UNREGISTER) { spin_lock(&mirred_list_lock); list_for_each_entry(m, &mirred_list, tcfm_list) { spin_lock_bh(&m->tcf_lock); if (tcf_mirred_dev_dereference(m) == dev) { netdev_put(dev, &m->tcfm_dev_tracker); /* Note : no rcu grace period necessary, as * net_device are already rcu protected. */ RCU_INIT_POINTER(m->tcfm_dev, NULL); } spin_unlock_bh(&m->tcf_lock); } spin_unlock(&mirred_list_lock); } return NOTIFY_DONE; } static struct notifier_block mirred_device_notifier = { .notifier_call = mirred_device_event, }; static void tcf_mirred_dev_put(void *priv) { struct net_device *dev = priv; dev_put(dev); } static struct net_device * tcf_mirred_get_dev(const struct tc_action *a, tc_action_priv_destructor *destructor) { struct tcf_mirred *m = to_mirred(a); struct net_device *dev; rcu_read_lock(); dev = rcu_dereference(m->tcfm_dev); if (dev) { dev_hold(dev); *destructor = tcf_mirred_dev_put; } rcu_read_unlock(); return dev; } static size_t tcf_mirred_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_mirred)); } static void tcf_offload_mirred_get_dev(struct flow_action_entry *entry, const struct tc_action *act) { entry->dev = act->ops->get_dev(act, &entry->destructor); if (!entry->dev) return; entry->destructor_priv = entry->dev; } static int tcf_mirred_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; if (is_tcf_mirred_egress_redirect(act)) { entry->id = FLOW_ACTION_REDIRECT; tcf_offload_mirred_get_dev(entry, act); } else if (is_tcf_mirred_egress_mirror(act)) { entry->id = FLOW_ACTION_MIRRED; tcf_offload_mirred_get_dev(entry, act); } else if (is_tcf_mirred_ingress_redirect(act)) { entry->id = FLOW_ACTION_REDIRECT_INGRESS; tcf_offload_mirred_get_dev(entry, act); } else if (is_tcf_mirred_ingress_mirror(act)) { entry->id = FLOW_ACTION_MIRRED_INGRESS; tcf_offload_mirred_get_dev(entry, act); } else { NL_SET_ERR_MSG_MOD(extack, "Unsupported mirred offload"); return -EOPNOTSUPP; } *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; if (is_tcf_mirred_egress_redirect(act)) fl_action->id = FLOW_ACTION_REDIRECT; else if (is_tcf_mirred_egress_mirror(act)) fl_action->id = FLOW_ACTION_MIRRED; else if (is_tcf_mirred_ingress_redirect(act)) fl_action->id = FLOW_ACTION_REDIRECT_INGRESS; else if (is_tcf_mirred_ingress_mirror(act)) fl_action->id = FLOW_ACTION_MIRRED_INGRESS; else return -EOPNOTSUPP; } return 0; } static struct tc_action_ops act_mirred_ops = { .kind = "mirred", .id = TCA_ID_MIRRED, .owner = THIS_MODULE, .act = tcf_mirred_act, .stats_update = tcf_stats_update, .dump = tcf_mirred_dump, .cleanup = tcf_mirred_release, .init = tcf_mirred_init, .get_fill_size = tcf_mirred_get_fill_size, .offload_act_setup = tcf_mirred_offload_act_setup, .size = sizeof(struct tcf_mirred), .get_dev = tcf_mirred_get_dev, }; MODULE_ALIAS_NET_ACT("mirred"); static __net_init int mirred_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_mirred_ops.net_id); return tc_action_net_init(net, tn, &act_mirred_ops); } static void __net_exit mirred_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_mirred_ops.net_id); } static struct pernet_operations mirred_net_ops = { .init = mirred_init_net, .exit_batch = mirred_exit_net, .id = &act_mirred_ops.net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002)"); MODULE_DESCRIPTION("Device Mirror/redirect actions"); MODULE_LICENSE("GPL"); static int __init mirred_init_module(void) { int err = register_netdevice_notifier(&mirred_device_notifier); if (err) return err; pr_info("Mirror/redirect action on\n"); err = tcf_register_action(&act_mirred_ops, &mirred_net_ops); if (err) unregister_netdevice_notifier(&mirred_device_notifier); return err; } static void __exit mirred_cleanup_module(void) { tcf_unregister_action(&act_mirred_ops, &mirred_net_ops); unregister_netdevice_notifier(&mirred_device_notifier); } module_init(mirred_init_module); module_exit(mirred_cleanup_module); |
18 18 3 15 18 1 17 11 11 9 9 9 9 7 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" struct linkinfo_req_info { struct ethnl_req_info base; }; struct linkinfo_reply_data { struct ethnl_reply_data base; struct ethtool_link_ksettings ksettings; struct ethtool_link_settings *lsettings; }; #define LINKINFO_REPDATA(__reply_base) \ container_of(__reply_base, struct linkinfo_reply_data, base) const struct nla_policy ethnl_linkinfo_get_policy[] = { [ETHTOOL_A_LINKINFO_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int linkinfo_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; data->lsettings = &data->ksettings.base; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = __ethtool_get_link_ksettings(dev, &data->ksettings); if (ret < 0) GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); ethnl_ops_complete(dev); return ret; } static int linkinfo_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { return nla_total_size(sizeof(u8)) /* LINKINFO_PORT */ + nla_total_size(sizeof(u8)) /* LINKINFO_PHYADDR */ + nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX */ + nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX_CTRL */ + nla_total_size(sizeof(u8)) /* LINKINFO_TRANSCEIVER */ + 0; } static int linkinfo_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base); if (nla_put_u8(skb, ETHTOOL_A_LINKINFO_PORT, data->lsettings->port) || nla_put_u8(skb, ETHTOOL_A_LINKINFO_PHYADDR, data->lsettings->phy_address) || nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX, data->lsettings->eth_tp_mdix) || nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX_CTRL, data->lsettings->eth_tp_mdix_ctrl) || nla_put_u8(skb, ETHTOOL_A_LINKINFO_TRANSCEIVER, data->lsettings->transceiver)) return -EMSGSIZE; return 0; } /* LINKINFO_SET */ const struct nla_policy ethnl_linkinfo_set_policy[] = { [ETHTOOL_A_LINKINFO_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_LINKINFO_PORT] = { .type = NLA_U8 }, [ETHTOOL_A_LINKINFO_PHYADDR] = { .type = NLA_U8 }, [ETHTOOL_A_LINKINFO_TP_MDIX_CTRL] = { .type = NLA_U8 }, }; static int ethnl_set_linkinfo_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; if (!ops->get_link_ksettings || !ops->set_link_ksettings) return -EOPNOTSUPP; return 1; } static int ethnl_set_linkinfo(struct ethnl_req_info *req_info, struct genl_info *info) { struct ethtool_link_ksettings ksettings = {}; struct ethtool_link_settings *lsettings; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; bool mod = false; int ret; ret = __ethtool_get_link_ksettings(dev, &ksettings); if (ret < 0) { GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); return ret; } lsettings = &ksettings.base; ethnl_update_u8(&lsettings->port, tb[ETHTOOL_A_LINKINFO_PORT], &mod); ethnl_update_u8(&lsettings->phy_address, tb[ETHTOOL_A_LINKINFO_PHYADDR], &mod); ethnl_update_u8(&lsettings->eth_tp_mdix_ctrl, tb[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL], &mod); if (!mod) return 0; ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); if (ret < 0) { GENL_SET_ERR_MSG(info, "link settings update failed"); return ret; } return 1; } const struct ethnl_request_ops ethnl_linkinfo_request_ops = { .request_cmd = ETHTOOL_MSG_LINKINFO_GET, .reply_cmd = ETHTOOL_MSG_LINKINFO_GET_REPLY, .hdr_attr = ETHTOOL_A_LINKINFO_HEADER, .req_info_size = sizeof(struct linkinfo_req_info), .reply_data_size = sizeof(struct linkinfo_reply_data), .prepare_data = linkinfo_prepare_data, .reply_size = linkinfo_reply_size, .fill_reply = linkinfo_fill_reply, .set_validate = ethnl_set_linkinfo_validate, .set = ethnl_set_linkinfo, .set_ntf_cmd = ETHTOOL_MSG_LINKINFO_NTF, }; |
6 6 4 8 8 7 7 1 4 6 6 5 4 4 4 4 15 7 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 | // SPDX-License-Identifier: GPL-2.0-only /* tunnel4.c: Generic IP tunnel transformer. * * Copyright (C) 2003 David S. Miller (davem@redhat.com) */ #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/mpls.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/icmp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/xfrm.h> static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly; static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly; static struct xfrm_tunnel __rcu *tunnelmpls4_handlers __read_mostly; static DEFINE_MUTEX(tunnel4_mutex); static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family) { return (family == AF_INET) ? &tunnel4_handlers : (family == AF_INET6) ? &tunnel64_handlers : &tunnelmpls4_handlers; } int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) { struct xfrm_tunnel __rcu **pprev; struct xfrm_tunnel *t; int ret = -EEXIST; int priority = handler->priority; mutex_lock(&tunnel4_mutex); for (pprev = fam_handlers(family); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel4_mutex))) != NULL; pprev = &t->next) { if (t->priority > priority) break; if (t->priority == priority) goto err; } handler->next = *pprev; rcu_assign_pointer(*pprev, handler); ret = 0; err: mutex_unlock(&tunnel4_mutex); return ret; } EXPORT_SYMBOL(xfrm4_tunnel_register); int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) { struct xfrm_tunnel __rcu **pprev; struct xfrm_tunnel *t; int ret = -ENOENT; mutex_lock(&tunnel4_mutex); for (pprev = fam_handlers(family); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel4_mutex))) != NULL; pprev = &t->next) { if (t == handler) { *pprev = handler->next; ret = 0; break; } } mutex_unlock(&tunnel4_mutex); synchronize_net(); return ret; } EXPORT_SYMBOL(xfrm4_tunnel_deregister); #define for_each_tunnel_rcu(head, handler) \ for (handler = rcu_dereference(head); \ handler != NULL; \ handler = rcu_dereference(handler->next)) \ static int tunnel4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->handler(skb)) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) static int tunnel4_rcv_cb(struct sk_buff *skb, u8 proto, int err) { struct xfrm_tunnel __rcu *head; struct xfrm_tunnel *handler; int ret; head = (proto == IPPROTO_IPIP) ? tunnel4_handlers : tunnel64_handlers; for_each_tunnel_rcu(head, handler) { if (handler->cb_handler) { ret = handler->cb_handler(skb, err); if (ret <= 0) return ret; } } return 0; } static const struct xfrm_input_afinfo tunnel4_input_afinfo = { .family = AF_INET, .is_ipip = true, .callback = tunnel4_rcv_cb, }; #endif #if IS_ENABLED(CONFIG_IPV6) static int tunnel64_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->handler(skb)) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #endif #if IS_ENABLED(CONFIG_MPLS) static int tunnelmpls4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct mpls_label))) goto drop; for_each_tunnel_rcu(tunnelmpls4_handlers, handler) if (!handler->handler(skb)) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #endif static int tunnel4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } #if IS_ENABLED(CONFIG_IPV6) static int tunnel64_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } #endif #if IS_ENABLED(CONFIG_MPLS) static int tunnelmpls4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnelmpls4_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } #endif static const struct net_protocol tunnel4_protocol = { .handler = tunnel4_rcv, .err_handler = tunnel4_err, .no_policy = 1, }; #if IS_ENABLED(CONFIG_IPV6) static const struct net_protocol tunnel64_protocol = { .handler = tunnel64_rcv, .err_handler = tunnel64_err, .no_policy = 1, }; #endif #if IS_ENABLED(CONFIG_MPLS) static const struct net_protocol tunnelmpls4_protocol = { .handler = tunnelmpls4_rcv, .err_handler = tunnelmpls4_err, .no_policy = 1, }; #endif static int __init tunnel4_init(void) { if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) goto err; #if IS_ENABLED(CONFIG_IPV6) if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) { inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); goto err; } #endif #if IS_ENABLED(CONFIG_MPLS) if (inet_add_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) { inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); #if IS_ENABLED(CONFIG_IPV6) inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6); #endif goto err; } #endif #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) if (xfrm_input_register_afinfo(&tunnel4_input_afinfo)) { inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); #if IS_ENABLED(CONFIG_IPV6) inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6); #endif #if IS_ENABLED(CONFIG_MPLS) inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS); #endif goto err; } #endif return 0; err: pr_err("%s: can't add protocol\n", __func__); return -EAGAIN; } static void __exit tunnel4_fini(void) { #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) if (xfrm_input_unregister_afinfo(&tunnel4_input_afinfo)) pr_err("tunnel4 close: can't remove input afinfo\n"); #endif #if IS_ENABLED(CONFIG_MPLS) if (inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) pr_err("tunnelmpls4 close: can't remove protocol\n"); #endif #if IS_ENABLED(CONFIG_IPV6) if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6)) pr_err("tunnel64 close: can't remove protocol\n"); #endif if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP)) pr_err("tunnel4 close: can't remove protocol\n"); } module_init(tunnel4_init); module_exit(tunnel4_fini); MODULE_DESCRIPTION("IPv4 XFRM tunnel library"); MODULE_LICENSE("GPL"); |
18 398 39 399 1 106 5 2 2 101 37 114 106 230 114 60 98 2 38 2 2 24 4 6 11 14 12 4 29 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Definitions for the 'struct ptr_ring' datastructure. * * Author: * Michael S. Tsirkin <mst@redhat.com> * * Copyright (C) 2016 Red Hat, Inc. * * This is a limited-size FIFO maintaining pointers in FIFO order, with * one CPU producing entries and another consuming entries from a FIFO. * * This implementation tries to minimize cache-contention when there is a * single producer and a single consumer CPU. */ #ifndef _LINUX_PTR_RING_H #define _LINUX_PTR_RING_H 1 #ifdef __KERNEL__ #include <linux/spinlock.h> #include <linux/cache.h> #include <linux/types.h> #include <linux/compiler.h> #include <linux/slab.h> #include <linux/mm.h> #include <asm/errno.h> #endif struct ptr_ring { int producer ____cacheline_aligned_in_smp; spinlock_t producer_lock; int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */ int consumer_tail; /* next entry to invalidate */ spinlock_t consumer_lock; /* Shared consumer/producer data */ /* Read-only by both the producer and the consumer */ int size ____cacheline_aligned_in_smp; /* max entries in queue */ int batch; /* number of entries to consume in a batch */ void **queue; }; /* Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). * * NB: this is unlike __ptr_ring_empty in that callers must hold producer_lock: * see e.g. ptr_ring_full. */ static inline bool __ptr_ring_full(struct ptr_ring *r) { return r->queue[r->producer]; } static inline bool ptr_ring_full(struct ptr_ring *r) { bool ret; spin_lock(&r->producer_lock); ret = __ptr_ring_full(r); spin_unlock(&r->producer_lock); return ret; } static inline bool ptr_ring_full_irq(struct ptr_ring *r) { bool ret; spin_lock_irq(&r->producer_lock); ret = __ptr_ring_full(r); spin_unlock_irq(&r->producer_lock); return ret; } static inline bool ptr_ring_full_any(struct ptr_ring *r) { unsigned long flags; bool ret; spin_lock_irqsave(&r->producer_lock, flags); ret = __ptr_ring_full(r); spin_unlock_irqrestore(&r->producer_lock, flags); return ret; } static inline bool ptr_ring_full_bh(struct ptr_ring *r) { bool ret; spin_lock_bh(&r->producer_lock); ret = __ptr_ring_full(r); spin_unlock_bh(&r->producer_lock); return ret; } /* Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). Callers must hold producer_lock. * Callers are responsible for making sure pointer that is being queued * points to a valid data. */ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr) { if (unlikely(!r->size) || r->queue[r->producer]) return -ENOSPC; /* Make sure the pointer we are storing points to a valid data. */ /* Pairs with the dependency ordering in __ptr_ring_consume. */ smp_wmb(); WRITE_ONCE(r->queue[r->producer++], ptr); if (unlikely(r->producer >= r->size)) r->producer = 0; return 0; } /* * Note: resize (below) nests producer lock within consumer lock, so if you * consume in interrupt or BH context, you must disable interrupts/BH when * calling this. */ static inline int ptr_ring_produce(struct ptr_ring *r, void *ptr) { int ret; spin_lock(&r->producer_lock); ret = __ptr_ring_produce(r, ptr); spin_unlock(&r->producer_lock); return ret; } static inline int ptr_ring_produce_irq(struct ptr_ring *r, void *ptr) { int ret; spin_lock_irq(&r->producer_lock); ret = __ptr_ring_produce(r, ptr); spin_unlock_irq(&r->producer_lock); return ret; } static inline int ptr_ring_produce_any(struct ptr_ring *r, void *ptr) { unsigned long flags; int ret; spin_lock_irqsave(&r->producer_lock, flags); ret = __ptr_ring_produce(r, ptr); spin_unlock_irqrestore(&r->producer_lock, flags); return ret; } static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr) { int ret; spin_lock_bh(&r->producer_lock); ret = __ptr_ring_produce(r, ptr); spin_unlock_bh(&r->producer_lock); return ret; } static inline void *__ptr_ring_peek(struct ptr_ring *r) { if (likely(r->size)) return READ_ONCE(r->queue[r->consumer_head]); return NULL; } /* * Test ring empty status without taking any locks. * * NB: This is only safe to call if ring is never resized. * * However, if some other CPU consumes ring entries at the same time, the value * returned is not guaranteed to be correct. * * In this case - to avoid incorrectly detecting the ring * as empty - the CPU consuming the ring entries is responsible * for either consuming all ring entries until the ring is empty, * or synchronizing with some other CPU and causing it to * re-test __ptr_ring_empty and/or consume the ring enteries * after the synchronization point. * * Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). */ static inline bool __ptr_ring_empty(struct ptr_ring *r) { if (likely(r->size)) return !r->queue[READ_ONCE(r->consumer_head)]; return true; } static inline bool ptr_ring_empty(struct ptr_ring *r) { bool ret; spin_lock(&r->consumer_lock); ret = __ptr_ring_empty(r); spin_unlock(&r->consumer_lock); return ret; } static inline bool ptr_ring_empty_irq(struct ptr_ring *r) { bool ret; spin_lock_irq(&r->consumer_lock); ret = __ptr_ring_empty(r); spin_unlock_irq(&r->consumer_lock); return ret; } static inline bool ptr_ring_empty_any(struct ptr_ring *r) { unsigned long flags; bool ret; spin_lock_irqsave(&r->consumer_lock, flags); ret = __ptr_ring_empty(r); spin_unlock_irqrestore(&r->consumer_lock, flags); return ret; } static inline bool ptr_ring_empty_bh(struct ptr_ring *r) { bool ret; spin_lock_bh(&r->consumer_lock); ret = __ptr_ring_empty(r); spin_unlock_bh(&r->consumer_lock); return ret; } /* Must only be called after __ptr_ring_peek returned !NULL */ static inline void __ptr_ring_discard_one(struct ptr_ring *r) { /* Fundamentally, what we want to do is update consumer * index and zero out the entry so producer can reuse it. * Doing it naively at each consume would be as simple as: * consumer = r->consumer; * r->queue[consumer++] = NULL; * if (unlikely(consumer >= r->size)) * consumer = 0; * r->consumer = consumer; * but that is suboptimal when the ring is full as producer is writing * out new entries in the same cache line. Defer these updates until a * batch of entries has been consumed. */ /* Note: we must keep consumer_head valid at all times for __ptr_ring_empty * to work correctly. */ int consumer_head = r->consumer_head; int head = consumer_head++; /* Once we have processed enough entries invalidate them in * the ring all at once so producer can reuse their space in the ring. * We also do this when we reach end of the ring - not mandatory * but helps keep the implementation simple. */ if (unlikely(consumer_head - r->consumer_tail >= r->batch || consumer_head >= r->size)) { /* Zero out entries in the reverse order: this way we touch the * cache line that producer might currently be reading the last; * producer won't make progress and touch other cache lines * besides the first one until we write out all entries. */ while (likely(head >= r->consumer_tail)) r->queue[head--] = NULL; r->consumer_tail = consumer_head; } if (unlikely(consumer_head >= r->size)) { consumer_head = 0; r->consumer_tail = 0; } /* matching READ_ONCE in __ptr_ring_empty for lockless tests */ WRITE_ONCE(r->consumer_head, consumer_head); } static inline void *__ptr_ring_consume(struct ptr_ring *r) { void *ptr; /* The READ_ONCE in __ptr_ring_peek guarantees that anyone * accessing data through the pointer is up to date. Pairs * with smp_wmb in __ptr_ring_produce. */ ptr = __ptr_ring_peek(r); if (ptr) __ptr_ring_discard_one(r); return ptr; } static inline int __ptr_ring_consume_batched(struct ptr_ring *r, void **array, int n) { void *ptr; int i; for (i = 0; i < n; i++) { ptr = __ptr_ring_consume(r); if (!ptr) break; array[i] = ptr; } return i; } /* * Note: resize (below) nests producer lock within consumer lock, so if you * call this in interrupt or BH context, you must disable interrupts/BH when * producing. */ static inline void *ptr_ring_consume(struct ptr_ring *r) { void *ptr; spin_lock(&r->consumer_lock); ptr = __ptr_ring_consume(r); spin_unlock(&r->consumer_lock); return ptr; } static inline void *ptr_ring_consume_irq(struct ptr_ring *r) { void *ptr; spin_lock_irq(&r->consumer_lock); ptr = __ptr_ring_consume(r); spin_unlock_irq(&r->consumer_lock); return ptr; } static inline void *ptr_ring_consume_any(struct ptr_ring *r) { unsigned long flags; void *ptr; spin_lock_irqsave(&r->consumer_lock, flags); ptr = __ptr_ring_consume(r); spin_unlock_irqrestore(&r->consumer_lock, flags); return ptr; } static inline void *ptr_ring_consume_bh(struct ptr_ring *r) { void *ptr; spin_lock_bh(&r->consumer_lock); ptr = __ptr_ring_consume(r); spin_unlock_bh(&r->consumer_lock); return ptr; } static inline int ptr_ring_consume_batched(struct ptr_ring *r, void **array, int n) { int ret; spin_lock(&r->consumer_lock); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock(&r->consumer_lock); return ret; } static inline int ptr_ring_consume_batched_irq(struct ptr_ring *r, void **array, int n) { int ret; spin_lock_irq(&r->consumer_lock); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock_irq(&r->consumer_lock); return ret; } static inline int ptr_ring_consume_batched_any(struct ptr_ring *r, void **array, int n) { unsigned long flags; int ret; spin_lock_irqsave(&r->consumer_lock, flags); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock_irqrestore(&r->consumer_lock, flags); return ret; } static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r, void **array, int n) { int ret; spin_lock_bh(&r->consumer_lock); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock_bh(&r->consumer_lock); return ret; } /* Cast to structure type and call a function without discarding from FIFO. * Function must return a value. * Callers must take consumer_lock. */ #define __PTR_RING_PEEK_CALL(r, f) ((f)(__ptr_ring_peek(r))) #define PTR_RING_PEEK_CALL(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ \ spin_lock(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v; \ }) #define PTR_RING_PEEK_CALL_IRQ(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ \ spin_lock_irq(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock_irq(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v; \ }) #define PTR_RING_PEEK_CALL_BH(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ \ spin_lock_bh(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock_bh(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v; \ }) #define PTR_RING_PEEK_CALL_ANY(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ unsigned long __PTR_RING_PEEK_CALL_f;\ \ spin_lock_irqsave(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock_irqrestore(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \ __PTR_RING_PEEK_CALL_v; \ }) /* Not all gfp_t flags (besides GFP_KERNEL) are allowed. See * documentation for vmalloc for which of them are legal. */ static inline void **__ptr_ring_init_queue_alloc_noprof(unsigned int size, gfp_t gfp) { if (size > KMALLOC_MAX_SIZE / sizeof(void *)) return NULL; return kvmalloc_array_noprof(size, sizeof(void *), gfp | __GFP_ZERO); } static inline void __ptr_ring_set_size(struct ptr_ring *r, int size) { r->size = size; r->batch = SMP_CACHE_BYTES * 2 / sizeof(*(r->queue)); /* We need to set batch at least to 1 to make logic * in __ptr_ring_discard_one work correctly. * Batching too much (because ring is small) would cause a lot of * burstiness. Needs tuning, for now disable batching. */ if (r->batch > r->size / 2 || !r->batch) r->batch = 1; } static inline int ptr_ring_init_noprof(struct ptr_ring *r, int size, gfp_t gfp) { r->queue = __ptr_ring_init_queue_alloc_noprof(size, gfp); if (!r->queue) return -ENOMEM; __ptr_ring_set_size(r, size); r->producer = r->consumer_head = r->consumer_tail = 0; spin_lock_init(&r->producer_lock); spin_lock_init(&r->consumer_lock); return 0; } #define ptr_ring_init(...) alloc_hooks(ptr_ring_init_noprof(__VA_ARGS__)) /* * Return entries into ring. Destroy entries that don't fit. * * Note: this is expected to be a rare slow path operation. * * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. * In particular if you consume ring in interrupt or BH context, you must * disable interrupts/BH when doing so. */ static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n, void (*destroy)(void *)) { unsigned long flags; int head; spin_lock_irqsave(&r->consumer_lock, flags); spin_lock(&r->producer_lock); if (!r->size) goto done; /* * Clean out buffered entries (for simplicity). This way following code * can test entries for NULL and if not assume they are valid. */ head = r->consumer_head - 1; while (likely(head >= r->consumer_tail)) r->queue[head--] = NULL; r->consumer_tail = r->consumer_head; /* * Go over entries in batch, start moving head back and copy entries. * Stop when we run into previously unconsumed entries. */ while (n) { head = r->consumer_head - 1; if (head < 0) head = r->size - 1; if (r->queue[head]) { /* This batch entry will have to be destroyed. */ goto done; } r->queue[head] = batch[--n]; r->consumer_tail = head; /* matching READ_ONCE in __ptr_ring_empty for lockless tests */ WRITE_ONCE(r->consumer_head, head); } done: /* Destroy all entries left in the batch. */ while (n) destroy(batch[--n]); spin_unlock(&r->producer_lock); spin_unlock_irqrestore(&r->consumer_lock, flags); } static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue, int size, gfp_t gfp, void (*destroy)(void *)) { int producer = 0; void **old; void *ptr; while ((ptr = __ptr_ring_consume(r))) if (producer < size) queue[producer++] = ptr; else if (destroy) destroy(ptr); if (producer >= size) producer = 0; __ptr_ring_set_size(r, size); r->producer = producer; r->consumer_head = 0; r->consumer_tail = 0; old = r->queue; r->queue = queue; return old; } /* * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. * In particular if you consume ring in interrupt or BH context, you must * disable interrupts/BH when doing so. */ static inline int ptr_ring_resize_noprof(struct ptr_ring *r, int size, gfp_t gfp, void (*destroy)(void *)) { unsigned long flags; void **queue = __ptr_ring_init_queue_alloc_noprof(size, gfp); void **old; if (!queue) return -ENOMEM; spin_lock_irqsave(&(r)->consumer_lock, flags); spin_lock(&(r)->producer_lock); old = __ptr_ring_swap_queue(r, queue, size, gfp, destroy); spin_unlock(&(r)->producer_lock); spin_unlock_irqrestore(&(r)->consumer_lock, flags); kvfree(old); return 0; } #define ptr_ring_resize(...) alloc_hooks(ptr_ring_resize_noprof(__VA_ARGS__)) /* * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. * In particular if you consume ring in interrupt or BH context, you must * disable interrupts/BH when doing so. */ static inline int ptr_ring_resize_multiple_noprof(struct ptr_ring **rings, unsigned int nrings, int size, gfp_t gfp, void (*destroy)(void *)) { unsigned long flags; void ***queues; int i; queues = kmalloc_array_noprof(nrings, sizeof(*queues), gfp); if (!queues) goto noqueues; for (i = 0; i < nrings; ++i) { queues[i] = __ptr_ring_init_queue_alloc_noprof(size, gfp); if (!queues[i]) goto nomem; } for (i = 0; i < nrings; ++i) { spin_lock_irqsave(&(rings[i])->consumer_lock, flags); spin_lock(&(rings[i])->producer_lock); queues[i] = __ptr_ring_swap_queue(rings[i], queues[i], size, gfp, destroy); spin_unlock(&(rings[i])->producer_lock); spin_unlock_irqrestore(&(rings[i])->consumer_lock, flags); } for (i = 0; i < nrings; ++i) kvfree(queues[i]); kfree(queues); return 0; nomem: while (--i >= 0) kvfree(queues[i]); kfree(queues); noqueues: return -ENOMEM; } #define ptr_ring_resize_multiple(...) \ alloc_hooks(ptr_ring_resize_multiple_noprof(__VA_ARGS__)) static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *)) { void *ptr; if (destroy) while ((ptr = ptr_ring_consume(r))) destroy(ptr); kvfree(r->queue); } #endif /* _LINUX_PTR_RING_H */ |
5 5 4 5 5 5 5 5 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. */ #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/fs.h> #include <linux/writeback.h> #include <linux/swap.h> #include <linux/gfs2_ondisk.h> #include <linux/backing-dev.h> #include <linux/uio.h> #include <trace/events/writeback.h> #include <linux/sched/signal.h> #include "gfs2.h" #include "incore.h" #include "bmap.h" #include "glock.h" #include "inode.h" #include "log.h" #include "meta_io.h" #include "quota.h" #include "trans.h" #include "rgrp.h" #include "super.h" #include "util.h" #include "glops.h" #include "aops.h" void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio, size_t from, size_t len) { struct buffer_head *head = folio_buffers(folio); unsigned int bsize = head->b_size; struct buffer_head *bh; size_t to = from + len; size_t start, end; for (bh = head, start = 0; bh != head || !start; bh = bh->b_this_page, start = end) { end = start + bsize; if (end <= from) continue; if (start >= to) break; set_buffer_uptodate(bh); gfs2_trans_add_data(ip->i_gl, bh); } } /** * gfs2_get_block_noalloc - Fills in a buffer head with details about a block * @inode: The inode * @lblock: The block number to look up * @bh_result: The buffer head to return the result in * @create: Non-zero if we may add block to the file * * Returns: errno */ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, struct buffer_head *bh_result, int create) { int error; error = gfs2_block_map(inode, lblock, bh_result, 0); if (error) return error; if (!buffer_mapped(bh_result)) return -ENODATA; return 0; } /** * gfs2_write_jdata_folio - gfs2 jdata-specific version of block_write_full_folio * @folio: The folio to write * @wbc: The writeback control * * This is the same as calling block_write_full_folio, but it also * writes pages outside of i_size */ static int gfs2_write_jdata_folio(struct folio *folio, struct writeback_control *wbc) { struct inode * const inode = folio->mapping->host; loff_t i_size = i_size_read(inode); /* * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ if (folio_pos(folio) < i_size && i_size < folio_pos(folio) + folio_size(folio)) folio_zero_segment(folio, offset_in_folio(folio, i_size), folio_size(folio)); return __block_write_full_folio(inode, folio, gfs2_get_block_noalloc, wbc); } /** * __gfs2_jdata_write_folio - The core of jdata writepage * @folio: The folio to write * @wbc: The writeback control * * Implements the core of write back. If a transaction is required then * the checked flag will have been set and the transaction will have * already been started before this is called. */ static int __gfs2_jdata_write_folio(struct folio *folio, struct writeback_control *wbc) { struct inode *inode = folio->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); if (folio_test_checked(folio)) { folio_clear_checked(folio); if (!folio_buffers(folio)) { create_empty_buffers(folio, inode->i_sb->s_blocksize, BIT(BH_Dirty)|BIT(BH_Uptodate)); } gfs2_trans_add_databufs(ip, folio, 0, folio_size(folio)); } return gfs2_write_jdata_folio(folio, wbc); } /** * gfs2_writepages - Write a bunch of dirty pages back to disk * @mapping: The mapping to write * @wbc: Write-back control * * Used for both ordered and writeback modes. */ static int gfs2_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); struct iomap_writepage_ctx wpc = { }; int ret; /* * Even if we didn't write enough pages here, we might still be holding * dirty pages in the ail. We forcibly flush the ail because we don't * want balance_dirty_pages() to loop indefinitely trying to write out * pages held in the ail that it can't find. */ ret = iomap_writepages(mapping, wbc, &wpc, &gfs2_writeback_ops); if (ret == 0 && wbc->nr_to_write > 0) set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags); return ret; } /** * gfs2_write_jdata_batch - Write back a folio batch's worth of folios * @mapping: The mapping * @wbc: The writeback control * @fbatch: The batch of folios * @done_index: Page index * * Returns: non-zero if loop should terminate, zero otherwise */ static int gfs2_write_jdata_batch(struct address_space *mapping, struct writeback_control *wbc, struct folio_batch *fbatch, pgoff_t *done_index) { struct inode *inode = mapping->host; struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned nrblocks; int i; int ret; size_t size = 0; int nr_folios = folio_batch_count(fbatch); for (i = 0; i < nr_folios; i++) size += folio_size(fbatch->folios[i]); nrblocks = size >> inode->i_blkbits; ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); if (ret < 0) return ret; for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch->folios[i]; *done_index = folio->index; folio_lock(folio); if (unlikely(folio->mapping != mapping)) { continue_unlock: folio_unlock(folio); continue; } if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } if (folio_test_writeback(folio)) { if (wbc->sync_mode != WB_SYNC_NONE) folio_wait_writeback(folio); else goto continue_unlock; } BUG_ON(folio_test_writeback(folio)); if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; trace_wbc_writepage(wbc, inode_to_bdi(inode)); ret = __gfs2_jdata_write_folio(folio, wbc); if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { folio_unlock(folio); ret = 0; } else { /* * done_index is set past this page, * so media errors will not choke * background writeout for the entire * file. This has consequences for * range_cyclic semantics (ie. it may * not be suitable for data integrity * writeout). */ *done_index = folio_next_index(folio); ret = 1; break; } } /* * We stop writing back only if we are not doing * integrity sync. In case of integrity sync we have to * keep going until we have written all the pages * we tagged for writeback prior to entering this loop. */ if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { ret = 1; break; } } gfs2_trans_end(sdp); return ret; } /** * gfs2_write_cache_jdata - Like write_cache_pages but different * @mapping: The mapping to write * @wbc: The writeback control * * The reason that we use our own function here is that we need to * start transactions before we grab page locks. This allows us * to get the ordering right. */ static int gfs2_write_cache_jdata(struct address_space *mapping, struct writeback_control *wbc) { int ret = 0; int done = 0; struct folio_batch fbatch; int nr_folios; pgoff_t writeback_index; pgoff_t index; pgoff_t end; pgoff_t done_index; int cycled; int range_whole = 0; xa_mark_t tag; folio_batch_init(&fbatch); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; if (index == 0) cycled = 1; else cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); if (nr_folios == 0) break; ret = gfs2_write_jdata_batch(mapping, wbc, &fbatch, &done_index); if (ret) done = 1; if (ret > 0) ret = 0; folio_batch_release(&fbatch); cond_resched(); } if (!cycled && !done) { /* * range_cyclic: * We hit the last page and there is more work to be done: wrap * back to the start of the file */ cycled = 1; index = 0; end = writeback_index - 1; goto retry; } if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; return ret; } /** * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk * @mapping: The mapping to write * @wbc: The writeback control * */ static int gfs2_jdata_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct gfs2_inode *ip = GFS2_I(mapping->host); struct gfs2_sbd *sdp = GFS2_SB(mapping->host); int ret; ret = gfs2_write_cache_jdata(mapping, wbc); if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) { gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_JDATA_WPAGES); ret = gfs2_write_cache_jdata(mapping, wbc); } return ret; } /** * stuffed_read_folio - Fill in a Linux folio with stuffed file data * @ip: the inode * @folio: the folio * * Returns: errno */ static int stuffed_read_folio(struct gfs2_inode *ip, struct folio *folio) { struct buffer_head *dibh = NULL; size_t dsize = i_size_read(&ip->i_inode); void *from = NULL; int error = 0; /* * Due to the order of unstuffing files and ->fault(), we can be * asked for a zero folio in the case of a stuffed file being extended, * so we need to supply one here. It doesn't happen often. */ if (unlikely(folio->index)) { dsize = 0; } else { error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto out; from = dibh->b_data + sizeof(struct gfs2_dinode); } folio_fill_tail(folio, 0, from, dsize); brelse(dibh); out: folio_end_read(folio, error == 0); return error; } /** * gfs2_read_folio - read a folio from a file * @file: The file to read * @folio: The folio in the file */ static int gfs2_read_folio(struct file *file, struct folio *folio) { struct inode *inode = folio->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); int error; if (!gfs2_is_jdata(ip) || (i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) { error = iomap_read_folio(folio, &gfs2_iomap_ops); } else if (gfs2_is_stuffed(ip)) { error = stuffed_read_folio(ip, folio); } else { error = mpage_read_folio(folio, gfs2_block_map); } if (gfs2_withdrawing_or_withdrawn(sdp)) return -EIO; return error; } /** * gfs2_internal_read - read an internal file * @ip: The gfs2 inode * @buf: The buffer to fill * @pos: The file position * @size: The amount to read * */ ssize_t gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, size_t size) { struct address_space *mapping = ip->i_inode.i_mapping; unsigned long index = *pos >> PAGE_SHIFT; size_t copied = 0; do { size_t offset, chunk; struct folio *folio; folio = read_cache_folio(mapping, index, gfs2_read_folio, NULL); if (IS_ERR(folio)) { if (PTR_ERR(folio) == -EINTR) continue; return PTR_ERR(folio); } offset = *pos + copied - folio_pos(folio); chunk = min(size - copied, folio_size(folio) - offset); memcpy_from_folio(buf + copied, folio, offset, chunk); index = folio_next_index(folio); folio_put(folio); copied += chunk; } while(copied < size); (*pos) += size; return size; } /** * gfs2_readahead - Read a bunch of pages at once * @rac: Read-ahead control structure * * Some notes: * 1. This is only for readahead, so we can simply ignore any things * which are slightly inconvenient (such as locking conflicts between * the page lock and the glock) and return having done no I/O. Its * obviously not something we'd want to do on too regular a basis. * Any I/O we ignore at this time will be done via readpage later. * 2. We don't handle stuffed files here we let readpage do the honours. * 3. mpage_readahead() does most of the heavy lifting in the common case. * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places. */ static void gfs2_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); if (gfs2_is_stuffed(ip)) ; else if (gfs2_is_jdata(ip)) mpage_readahead(rac, gfs2_block_map); else iomap_readahead(rac, &gfs2_iomap_ops); } /** * adjust_fs_space - Adjusts the free space available due to gfs2_grow * @inode: the rindex inode */ void adjust_fs_space(struct inode *inode) { struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; struct buffer_head *m_bh; u64 fs_total, new_free; if (gfs2_trans_begin(sdp, 2 * RES_STATFS, 0) != 0) return; /* Total up the file system space, according to the latest rindex. */ fs_total = gfs2_ri_total(sdp); if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0) goto out; spin_lock(&sdp->sd_statfs_spin); gfs2_statfs_change_in(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); if (fs_total > (m_sc->sc_total + l_sc->sc_total)) new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); else new_free = 0; spin_unlock(&sdp->sd_statfs_spin); fs_warn(sdp, "File system extended by %llu blocks.\n", (unsigned long long)new_free); gfs2_statfs_change(sdp, new_free, new_free, 0); update_statfs(sdp, m_bh); brelse(m_bh); out: sdp->sd_rindex_uptodate = 0; gfs2_trans_end(sdp); } static bool jdata_dirty_folio(struct address_space *mapping, struct folio *folio) { if (current->journal_info) folio_set_checked(folio); return block_dirty_folio(mapping, folio); } /** * gfs2_bmap - Block map function * @mapping: Address space info * @lblock: The block to map * * Returns: The disk address for the block or 0 on hole or error */ static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock) { struct gfs2_inode *ip = GFS2_I(mapping->host); struct gfs2_holder i_gh; sector_t dblock = 0; int error; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return 0; if (!gfs2_is_stuffed(ip)) dblock = iomap_bmap(mapping, lblock, &gfs2_iomap_ops); gfs2_glock_dq_uninit(&i_gh); return dblock; } static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) { struct gfs2_bufdata *bd; lock_buffer(bh); gfs2_log_lock(sdp); clear_buffer_dirty(bh); bd = bh->b_private; if (bd) { if (!list_empty(&bd->bd_list) && !buffer_pinned(bh)) list_del_init(&bd->bd_list); else { spin_lock(&sdp->sd_ail_lock); gfs2_remove_from_journal(bh, REMOVE_JDATA); spin_unlock(&sdp->sd_ail_lock); } } bh->b_bdev = NULL; clear_buffer_mapped(bh); clear_buffer_req(bh); clear_buffer_new(bh); gfs2_log_unlock(sdp); unlock_buffer(bh); } static void gfs2_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct gfs2_sbd *sdp = GFS2_SB(folio->mapping->host); size_t stop = offset + length; int partial_page = (offset || length < folio_size(folio)); struct buffer_head *bh, *head; unsigned long pos = 0; BUG_ON(!folio_test_locked(folio)); if (!partial_page) folio_clear_checked(folio); head = folio_buffers(folio); if (!head) goto out; bh = head; do { if (pos + bh->b_size > stop) return; if (offset <= pos) gfs2_discard(sdp, bh); pos += bh->b_size; bh = bh->b_this_page; } while (bh != head); out: if (!partial_page) filemap_release_folio(folio, 0); } /** * gfs2_release_folio - free the metadata associated with a folio * @folio: the folio that's being released * @gfp_mask: passed from Linux VFS, ignored by us * * Calls try_to_free_buffers() to free the buffers and put the folio if the * buffers can be released. * * Returns: true if the folio was put or else false */ bool gfs2_release_folio(struct folio *folio, gfp_t gfp_mask) { struct address_space *mapping = folio->mapping; struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); struct buffer_head *bh, *head; struct gfs2_bufdata *bd; head = folio_buffers(folio); if (!head) return false; /* * mm accommodates an old ext3 case where clean folios might * not have had the dirty bit cleared. Thus, it can send actual * dirty folios to ->release_folio() via shrink_active_list(). * * As a workaround, we skip folios that contain dirty buffers * below. Once ->release_folio isn't called on dirty folios * anymore, we can warn on dirty buffers like we used to here * again. */ gfs2_log_lock(sdp); bh = head; do { if (atomic_read(&bh->b_count)) goto cannot_release; bd = bh->b_private; if (bd && bd->bd_tr) goto cannot_release; if (buffer_dirty(bh) || WARN_ON(buffer_pinned(bh))) goto cannot_release; bh = bh->b_this_page; } while (bh != head); bh = head; do { bd = bh->b_private; if (bd) { gfs2_assert_warn(sdp, bd->bd_bh == bh); bd->bd_bh = NULL; bh->b_private = NULL; /* * The bd may still be queued as a revoke, in which * case we must not dequeue nor free it. */ if (!bd->bd_blkno && !list_empty(&bd->bd_list)) list_del_init(&bd->bd_list); if (list_empty(&bd->bd_list)) kmem_cache_free(gfs2_bufdata_cachep, bd); } bh = bh->b_this_page; } while (bh != head); gfs2_log_unlock(sdp); return try_to_free_buffers(folio); cannot_release: gfs2_log_unlock(sdp); return false; } static const struct address_space_operations gfs2_aops = { .writepages = gfs2_writepages, .read_folio = gfs2_read_folio, .readahead = gfs2_readahead, .dirty_folio = iomap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .bmap = gfs2_bmap, .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, }; static const struct address_space_operations gfs2_jdata_aops = { .writepages = gfs2_jdata_writepages, .read_folio = gfs2_read_folio, .readahead = gfs2_readahead, .dirty_folio = jdata_dirty_folio, .bmap = gfs2_bmap, .migrate_folio = buffer_migrate_folio, .invalidate_folio = gfs2_invalidate_folio, .release_folio = gfs2_release_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, }; void gfs2_set_aops(struct inode *inode) { if (gfs2_is_jdata(GFS2_I(inode))) inode->i_mapping->a_ops = &gfs2_jdata_aops; else inode->i_mapping->a_ops = &gfs2_aops; } |
1 4 5 2 2 4 4 6 9 6 5 2 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 | // SPDX-License-Identifier: GPL-2.0+ /* * Copyright (C) 2003-2008 Takahiro Hirofuchi */ #include <linux/kthread.h> #include <linux/slab.h> #include "usbip_common.h" #include "vhci.h" /* get URB from transmitted urb queue. caller must hold vdev->priv_lock */ struct urb *pickup_urb_and_free_priv(struct vhci_device *vdev, __u32 seqnum) { struct vhci_priv *priv, *tmp; struct urb *urb = NULL; int status; list_for_each_entry_safe(priv, tmp, &vdev->priv_rx, list) { if (priv->seqnum != seqnum) continue; urb = priv->urb; status = urb->status; usbip_dbg_vhci_rx("find urb seqnum %u\n", seqnum); switch (status) { case -ENOENT: fallthrough; case -ECONNRESET: dev_dbg(&urb->dev->dev, "urb seq# %u was unlinked %ssynchronously\n", seqnum, status == -ENOENT ? "" : "a"); break; case -EINPROGRESS: /* no info output */ break; default: dev_dbg(&urb->dev->dev, "urb seq# %u may be in a error, status %d\n", seqnum, status); } list_del(&priv->list); kfree(priv); urb->hcpriv = NULL; break; } return urb; } static void vhci_recv_ret_submit(struct vhci_device *vdev, struct usbip_header *pdu) { struct vhci_hcd *vhci_hcd = vdev_to_vhci_hcd(vdev); struct vhci *vhci = vhci_hcd->vhci; struct usbip_device *ud = &vdev->ud; struct urb *urb; unsigned long flags; spin_lock_irqsave(&vdev->priv_lock, flags); urb = pickup_urb_and_free_priv(vdev, pdu->base.seqnum); spin_unlock_irqrestore(&vdev->priv_lock, flags); if (!urb) { pr_err("cannot find a urb of seqnum %u max seqnum %d\n", pdu->base.seqnum, atomic_read(&vhci_hcd->seqnum)); usbip_event_add(ud, VDEV_EVENT_ERROR_TCP); return; } /* unpack the pdu to a urb */ usbip_pack_pdu(pdu, urb, USBIP_RET_SUBMIT, 0); /* recv transfer buffer */ if (usbip_recv_xbuff(ud, urb) < 0) { urb->status = -EPROTO; goto error; } /* recv iso_packet_descriptor */ if (usbip_recv_iso(ud, urb) < 0) { urb->status = -EPROTO; goto error; } /* restore the padding in iso packets */ usbip_pad_iso(ud, urb); error: if (usbip_dbg_flag_vhci_rx) usbip_dump_urb(urb); if (urb->num_sgs) urb->transfer_flags &= ~URB_DMA_MAP_SG; usbip_dbg_vhci_rx("now giveback urb %u\n", pdu->base.seqnum); spin_lock_irqsave(&vhci->lock, flags); usb_hcd_unlink_urb_from_ep(vhci_hcd_to_hcd(vhci_hcd), urb); spin_unlock_irqrestore(&vhci->lock, flags); usb_hcd_giveback_urb(vhci_hcd_to_hcd(vhci_hcd), urb, urb->status); usbip_dbg_vhci_rx("Leave\n"); } static struct vhci_unlink *dequeue_pending_unlink(struct vhci_device *vdev, struct usbip_header *pdu) { struct vhci_unlink *unlink, *tmp; unsigned long flags; spin_lock_irqsave(&vdev->priv_lock, flags); list_for_each_entry_safe(unlink, tmp, &vdev->unlink_rx, list) { pr_info("unlink->seqnum %lu\n", unlink->seqnum); if (unlink->seqnum == pdu->base.seqnum) { usbip_dbg_vhci_rx("found pending unlink, %lu\n", unlink->seqnum); list_del(&unlink->list); spin_unlock_irqrestore(&vdev->priv_lock, flags); return unlink; } } spin_unlock_irqrestore(&vdev->priv_lock, flags); return NULL; } static void vhci_recv_ret_unlink(struct vhci_device *vdev, struct usbip_header *pdu) { struct vhci_hcd *vhci_hcd = vdev_to_vhci_hcd(vdev); struct vhci *vhci = vhci_hcd->vhci; struct vhci_unlink *unlink; struct urb *urb; unsigned long flags; usbip_dump_header(pdu); unlink = dequeue_pending_unlink(vdev, pdu); if (!unlink) { pr_info("cannot find the pending unlink %u\n", pdu->base.seqnum); return; } spin_lock_irqsave(&vdev->priv_lock, flags); urb = pickup_urb_and_free_priv(vdev, unlink->unlink_seqnum); spin_unlock_irqrestore(&vdev->priv_lock, flags); if (!urb) { /* * I get the result of a unlink request. But, it seems that I * already received the result of its submit result and gave * back the URB. */ pr_info("the urb (seqnum %d) was already given back\n", pdu->base.seqnum); } else { usbip_dbg_vhci_rx("now giveback urb %d\n", pdu->base.seqnum); /* If unlink is successful, status is -ECONNRESET */ urb->status = pdu->u.ret_unlink.status; pr_info("urb->status %d\n", urb->status); spin_lock_irqsave(&vhci->lock, flags); usb_hcd_unlink_urb_from_ep(vhci_hcd_to_hcd(vhci_hcd), urb); spin_unlock_irqrestore(&vhci->lock, flags); usb_hcd_giveback_urb(vhci_hcd_to_hcd(vhci_hcd), urb, urb->status); } kfree(unlink); } static int vhci_priv_tx_empty(struct vhci_device *vdev) { int empty = 0; unsigned long flags; spin_lock_irqsave(&vdev->priv_lock, flags); empty = list_empty(&vdev->priv_rx); spin_unlock_irqrestore(&vdev->priv_lock, flags); return empty; } /* recv a pdu */ static void vhci_rx_pdu(struct usbip_device *ud) { int ret; struct usbip_header pdu; struct vhci_device *vdev = container_of(ud, struct vhci_device, ud); usbip_dbg_vhci_rx("Enter\n"); memset(&pdu, 0, sizeof(pdu)); /* receive a pdu header */ ret = usbip_recv(ud->tcp_socket, &pdu, sizeof(pdu)); if (ret < 0) { if (ret == -ECONNRESET) pr_info("connection reset by peer\n"); else if (ret == -EAGAIN) { /* ignore if connection was idle */ if (vhci_priv_tx_empty(vdev)) return; pr_info("connection timed out with pending urbs\n"); } else if (ret != -ERESTARTSYS) pr_info("xmit failed %d\n", ret); usbip_event_add(ud, VDEV_EVENT_ERROR_TCP); return; } if (ret == 0) { pr_info("connection closed"); usbip_event_add(ud, VDEV_EVENT_DOWN); return; } if (ret != sizeof(pdu)) { pr_err("received pdu size is %d, should be %d\n", ret, (unsigned int)sizeof(pdu)); usbip_event_add(ud, VDEV_EVENT_ERROR_TCP); return; } usbip_header_correct_endian(&pdu, 0); if (usbip_dbg_flag_vhci_rx) usbip_dump_header(&pdu); switch (pdu.base.command) { case USBIP_RET_SUBMIT: vhci_recv_ret_submit(vdev, &pdu); break; case USBIP_RET_UNLINK: vhci_recv_ret_unlink(vdev, &pdu); break; default: /* NOT REACHED */ pr_err("unknown pdu %u\n", pdu.base.command); usbip_dump_header(&pdu); usbip_event_add(ud, VDEV_EVENT_ERROR_TCP); break; } } int vhci_rx_loop(void *data) { struct usbip_device *ud = data; while (!kthread_should_stop()) { if (usbip_event_happened(ud)) break; usbip_kcov_remote_start(ud); vhci_rx_pdu(ud); usbip_kcov_remote_stop(); } return 0; } |
25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_BTREE_H__ #define __XFS_BTREE_H__ struct xfs_buf; struct xfs_inode; struct xfs_mount; struct xfs_trans; struct xfs_ifork; struct xfs_perag; /* * Generic key, ptr and record wrapper structures. * * These are disk format structures, and are converted where necessary * by the btree specific code that needs to interpret them. */ union xfs_btree_ptr { __be32 s; /* short form ptr */ __be64 l; /* long form ptr */ }; /* * The in-core btree key. Overlapping btrees actually store two keys * per pointer, so we reserve enough memory to hold both. The __*bigkey * items should never be accessed directly. */ union xfs_btree_key { struct xfs_bmbt_key bmbt; xfs_bmdr_key_t bmbr; /* bmbt root block */ xfs_alloc_key_t alloc; struct xfs_inobt_key inobt; struct xfs_rmap_key rmap; struct xfs_rmap_key __rmap_bigkey[2]; struct xfs_refcount_key refc; }; union xfs_btree_rec { struct xfs_bmbt_rec bmbt; xfs_bmdr_rec_t bmbr; /* bmbt root block */ struct xfs_alloc_rec alloc; struct xfs_inobt_rec inobt; struct xfs_rmap_rec rmap; struct xfs_refcount_rec refc; }; /* * This nonsense is to make -wlint happy. */ #define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi) #define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi) #define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi) struct xfs_btree_ops; uint32_t xfs_btree_magic(struct xfs_mount *mp, const struct xfs_btree_ops *ops); /* * For logging record fields. */ #define XFS_BB_MAGIC (1u << 0) #define XFS_BB_LEVEL (1u << 1) #define XFS_BB_NUMRECS (1u << 2) #define XFS_BB_LEFTSIB (1u << 3) #define XFS_BB_RIGHTSIB (1u << 4) #define XFS_BB_BLKNO (1u << 5) #define XFS_BB_LSN (1u << 6) #define XFS_BB_UUID (1u << 7) #define XFS_BB_OWNER (1u << 8) #define XFS_BB_NUM_BITS 5 #define XFS_BB_ALL_BITS ((1u << XFS_BB_NUM_BITS) - 1) #define XFS_BB_NUM_BITS_CRC 9 #define XFS_BB_ALL_BITS_CRC ((1u << XFS_BB_NUM_BITS_CRC) - 1) /* * Generic stats interface */ #define XFS_BTREE_STATS_INC(cur, stat) \ XFS_STATS_INC_OFF((cur)->bc_mp, \ (cur)->bc_ops->statoff + __XBTS_ ## stat) #define XFS_BTREE_STATS_ADD(cur, stat, val) \ XFS_STATS_ADD_OFF((cur)->bc_mp, \ (cur)->bc_ops->statoff + __XBTS_ ## stat, val) enum xbtree_key_contig { XBTREE_KEY_GAP = 0, XBTREE_KEY_CONTIGUOUS, XBTREE_KEY_OVERLAP, }; /* * Decide if these two numeric btree key fields are contiguous, overlapping, * or if there's a gap between them. @x should be the field from the high * key and @y should be the field from the low key. */ static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y) { x++; if (x < y) return XBTREE_KEY_GAP; if (x == y) return XBTREE_KEY_CONTIGUOUS; return XBTREE_KEY_OVERLAP; } #define XFS_BTREE_LONG_PTR_LEN (sizeof(__be64)) #define XFS_BTREE_SHORT_PTR_LEN (sizeof(__be32)) enum xfs_btree_type { XFS_BTREE_TYPE_AG, XFS_BTREE_TYPE_INODE, XFS_BTREE_TYPE_MEM, }; struct xfs_btree_ops { const char *name; /* Type of btree - AG-rooted or inode-rooted */ enum xfs_btree_type type; /* XFS_BTGEO_* flags that determine the geometry of the btree */ unsigned int geom_flags; /* size of the key, pointer, and record structures */ size_t key_len; size_t ptr_len; size_t rec_len; /* LRU refcount to set on each btree buffer created */ unsigned int lru_refs; /* offset of btree stats array */ unsigned int statoff; /* sick mask for health reporting (only for XFS_BTREE_TYPE_AG) */ unsigned int sick_mask; /* cursor operations */ struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *); void (*update_cursor)(struct xfs_btree_cur *src, struct xfs_btree_cur *dst); /* update btree root pointer */ void (*set_root)(struct xfs_btree_cur *cur, const union xfs_btree_ptr *nptr, int level_change); /* block allocation / freeing */ int (*alloc_block)(struct xfs_btree_cur *cur, const union xfs_btree_ptr *start_bno, union xfs_btree_ptr *new_bno, int *stat); int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); /* records in block/level */ int (*get_minrecs)(struct xfs_btree_cur *cur, int level); int (*get_maxrecs)(struct xfs_btree_cur *cur, int level); /* records on disk. Matter for the root in inode case. */ int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level); /* init values of btree structures */ void (*init_key_from_rec)(union xfs_btree_key *key, const union xfs_btree_rec *rec); void (*init_rec_from_cur)(struct xfs_btree_cur *cur, union xfs_btree_rec *rec); void (*init_ptr_from_cur)(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); void (*init_high_key_from_rec)(union xfs_btree_key *key, const union xfs_btree_rec *rec); /* difference between key value and cursor value */ int64_t (*key_diff)(struct xfs_btree_cur *cur, const union xfs_btree_key *key); /* * Difference between key2 and key1 -- positive if key1 > key2, * negative if key1 < key2, and zero if equal. If the @mask parameter * is non NULL, each key field to be used in the comparison must * contain a nonzero value. */ int64_t (*diff_two_keys)(struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask); const struct xfs_buf_ops *buf_ops; /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2); /* check that r1 is lower than r2 */ int (*recs_inorder)(struct xfs_btree_cur *cur, const union xfs_btree_rec *r1, const union xfs_btree_rec *r2); /* * Are these two btree keys immediately adjacent? * * Given two btree keys @key1 and @key2, decide if it is impossible for * there to be a third btree key K satisfying the relationship * @key1 < K < @key2. To determine if two btree records are * immediately adjacent, @key1 should be the high key of the first * record and @key2 should be the low key of the second record. * If the @mask parameter is non NULL, each key field to be used in the * comparison must contain a nonzero value. */ enum xbtree_key_contig (*keys_contiguous)(struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask); }; /* btree geometry flags */ #define XFS_BTGEO_OVERLAPPING (1U << 0) /* overlapping intervals */ union xfs_btree_irec { struct xfs_alloc_rec_incore a; struct xfs_bmbt_irec b; struct xfs_inobt_rec_incore i; struct xfs_rmap_irec r; struct xfs_refcount_irec rc; }; struct xfs_btree_level { /* buffer pointer */ struct xfs_buf *bp; /* key/record number */ uint16_t ptr; /* readahead info */ #define XFS_BTCUR_LEFTRA (1 << 0) /* left sibling has been read-ahead */ #define XFS_BTCUR_RIGHTRA (1 << 1) /* right sibling has been read-ahead */ uint16_t ra; }; /* * Btree cursor structure. * This collects all information needed by the btree code in one place. */ struct xfs_btree_cur { struct xfs_trans *bc_tp; /* transaction we're in, if any */ struct xfs_mount *bc_mp; /* file system mount struct */ const struct xfs_btree_ops *bc_ops; struct kmem_cache *bc_cache; /* cursor cache */ unsigned int bc_flags; /* btree features - below */ union xfs_btree_irec bc_rec; /* current insert/search record value */ uint8_t bc_nlevels; /* number of levels in the tree */ uint8_t bc_maxlevels; /* maximum levels for this btree type */ struct xfs_group *bc_group; /* per-type information */ union { struct { struct xfs_inode *ip; short forksize; char whichfork; struct xbtree_ifakeroot *ifake; /* for staging cursor */ } bc_ino; struct { struct xfs_buf *agbp; struct xbtree_afakeroot *afake; /* for staging cursor */ } bc_ag; struct { struct xfbtree *xfbtree; } bc_mem; }; /* per-format private data */ union { struct { int allocated; } bc_bmap; /* bmapbt */ struct { unsigned int nr_ops; /* # record updates */ unsigned int shape_changes; /* # of extent splits */ } bc_refc; /* refcountbt */ }; /* Must be at the end of the struct! */ struct xfs_btree_level bc_levels[]; }; /* * Compute the size of a btree cursor that can handle a btree of a given * height. The bc_levels array handles node and leaf blocks, so its size * is exactly nlevels. */ static inline size_t xfs_btree_cur_sizeof(unsigned int nlevels) { return struct_size_t(struct xfs_btree_cur, bc_levels, nlevels); } /* cursor state flags */ /* * The root of this btree is a fakeroot structure so that we can stage a btree * rebuild without leaving it accessible via primary metadata. The ops struct * is dynamically allocated and must be freed when the cursor is deleted. */ #define XFS_BTREE_STAGING (1U << 0) /* We are converting a delalloc reservation (only for bmbt btrees) */ #define XFS_BTREE_BMBT_WASDEL (1U << 1) /* For extent swap, ignore owner check in verifier (only for bmbt btrees) */ #define XFS_BTREE_BMBT_INVALID_OWNER (1U << 2) /* Cursor is active (only for allocbt btrees) */ #define XFS_BTREE_ALLOCBT_ACTIVE (1U << 3) #define XFS_BTREE_NOERROR 0 #define XFS_BTREE_ERROR 1 /* * Convert from buffer to btree block header. */ #define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr)) xfs_failaddr_t __xfs_btree_check_block(struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp); int __xfs_btree_check_ptr(struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, int index, int level); /* * Check that block header is ok. */ int xfs_btree_check_block( struct xfs_btree_cur *cur, /* btree cursor */ struct xfs_btree_block *block, /* generic btree block pointer */ int level, /* level of the btree block */ struct xfs_buf *bp); /* buffer containing block, if any */ /* * Delete the btree cursor. */ void xfs_btree_del_cursor( struct xfs_btree_cur *cur, /* btree cursor */ int error); /* del because of error */ /* * Duplicate the btree cursor. * Allocate a new one, copy the record, re-get the buffers. */ int /* error */ xfs_btree_dup_cursor( struct xfs_btree_cur *cur, /* input cursor */ struct xfs_btree_cur **ncur);/* output cursor */ /* * Compute first and last byte offsets for the fields given. * Interprets the offsets table, which contains struct field offsets. */ void xfs_btree_offsets( uint32_t fields, /* bitmask of fields */ const short *offsets,/* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ int *last); /* output: last byte offset */ /* * Initialise a new btree block header */ void xfs_btree_init_buf(struct xfs_mount *mp, struct xfs_buf *bp, const struct xfs_btree_ops *ops, __u16 level, __u16 numrecs, __u64 owner); void xfs_btree_init_block(struct xfs_mount *mp, struct xfs_btree_block *buf, const struct xfs_btree_ops *ops, __u16 level, __u16 numrecs, __u64 owner); /* * Common btree core entry points. */ int xfs_btree_increment(struct xfs_btree_cur *, int, int *); int xfs_btree_decrement(struct xfs_btree_cur *, int, int *); int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *); int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *); int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); int xfs_btree_insert(struct xfs_btree_cur *, int *); int xfs_btree_delete(struct xfs_btree_cur *, int *); int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); int xfs_btree_change_owner(struct xfs_btree_cur *cur, uint64_t new_owner, struct list_head *buffer_list); /* * btree block CRC helpers */ void xfs_btree_fsblock_calc_crc(struct xfs_buf *); bool xfs_btree_fsblock_verify_crc(struct xfs_buf *); void xfs_btree_agblock_calc_crc(struct xfs_buf *); bool xfs_btree_agblock_verify_crc(struct xfs_buf *); /* * Internal btree helpers also used by xfs_bmap.c. */ void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, uint32_t); void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int); /* * Helpers. */ static inline int xfs_btree_get_numrecs(const struct xfs_btree_block *block) { return be16_to_cpu(block->bb_numrecs); } static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block, uint16_t numrecs) { block->bb_numrecs = cpu_to_be16(numrecs); } static inline int xfs_btree_get_level(const struct xfs_btree_block *block) { return be16_to_cpu(block->bb_level); } /* * Min and max functions for extlen, agblock, fileoff, and filblks types. */ #define XFS_EXTLEN_MIN(a,b) min_t(xfs_extlen_t, (a), (b)) #define XFS_EXTLEN_MAX(a,b) max_t(xfs_extlen_t, (a), (b)) #define XFS_AGBLOCK_MIN(a,b) min_t(xfs_agblock_t, (a), (b)) #define XFS_AGBLOCK_MAX(a,b) max_t(xfs_agblock_t, (a), (b)) #define XFS_FILEOFF_MIN(a,b) min_t(xfs_fileoff_t, (a), (b)) #define XFS_FILEOFF_MAX(a,b) max_t(xfs_fileoff_t, (a), (b)) #define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b)) #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) xfs_failaddr_t xfs_btree_agblock_v5hdr_verify(struct xfs_buf *bp); xfs_failaddr_t xfs_btree_agblock_verify(struct xfs_buf *bp, unsigned int max_recs); xfs_failaddr_t xfs_btree_fsblock_v5hdr_verify(struct xfs_buf *bp, uint64_t owner); xfs_failaddr_t xfs_btree_fsblock_verify(struct xfs_buf *bp, unsigned int max_recs); xfs_failaddr_t xfs_btree_memblock_verify(struct xfs_buf *bp, unsigned int max_recs); unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits, unsigned long long records); unsigned long long xfs_btree_calc_size(const unsigned int *limits, unsigned long long records); unsigned int xfs_btree_space_to_height(const unsigned int *limits, unsigned long long blocks); /* * Return codes for the query range iterator function are 0 to continue * iterating, and non-zero to stop iterating. Any non-zero value will be * passed up to the _query_range caller. The special value -ECANCELED can be * used to stop iteration, because _query_range never generates that error * code on its own. */ typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, const union xfs_btree_rec *rec, void *priv); int xfs_btree_query_range(struct xfs_btree_cur *cur, const union xfs_btree_irec *low_rec, const union xfs_btree_irec *high_rec, xfs_btree_query_range_fn fn, void *priv); int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn, void *priv); typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, void *data); /* Visit record blocks. */ #define XFS_BTREE_VISIT_RECORDS (1 << 0) /* Visit leaf blocks. */ #define XFS_BTREE_VISIT_LEAVES (1 << 1) /* Visit all blocks. */ #define XFS_BTREE_VISIT_ALL (XFS_BTREE_VISIT_RECORDS | \ XFS_BTREE_VISIT_LEAVES) int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data); int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_filblks_t *blocks); union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); union xfs_btree_key *xfs_btree_key_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); union xfs_btree_key *xfs_btree_high_key_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); union xfs_btree_ptr *xfs_btree_ptr_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level, const union xfs_btree_ptr *pp, struct xfs_btree_block **blkp); struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur, int level, struct xfs_buf **bpp); bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr); int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur, const union xfs_btree_ptr *a, const union xfs_btree_ptr *b); void xfs_btree_get_sibling(struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_ptr *ptr, int lr); void xfs_btree_get_keys(struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_key *key); union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, union xfs_btree_key *key); typedef bool (*xfs_btree_key_gap_fn)(struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2); int xfs_btree_has_records(struct xfs_btree_cur *cur, const union xfs_btree_irec *low, const union xfs_btree_irec *high, const union xfs_btree_key *mask, enum xbtree_recpacking *outcome); bool xfs_btree_has_more_records(struct xfs_btree_cur *cur); struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur); /* Key comparison helpers */ static inline bool xfs_btree_keycmp_lt( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2) { return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) < 0; } static inline bool xfs_btree_keycmp_gt( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2) { return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) > 0; } static inline bool xfs_btree_keycmp_eq( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2) { return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) == 0; } static inline bool xfs_btree_keycmp_le( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2) { return !xfs_btree_keycmp_gt(cur, key1, key2); } static inline bool xfs_btree_keycmp_ge( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2) { return !xfs_btree_keycmp_lt(cur, key1, key2); } static inline bool xfs_btree_keycmp_ne( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2) { return !xfs_btree_keycmp_eq(cur, key1, key2); } /* Masked key comparison helpers */ static inline bool xfs_btree_masked_keycmp_lt( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask) { return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) < 0; } static inline bool xfs_btree_masked_keycmp_gt( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask) { return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) > 0; } static inline bool xfs_btree_masked_keycmp_ge( struct xfs_btree_cur *cur, const union xfs_btree_key *key1, const union xfs_btree_key *key2, const union xfs_btree_key *mask) { return !xfs_btree_masked_keycmp_lt(cur, key1, key2, mask); } /* Does this cursor point to the last block in the given level? */ static inline bool xfs_btree_islastblock( struct xfs_btree_cur *cur, int level) { struct xfs_btree_block *block; struct xfs_buf *bp; block = xfs_btree_get_block(cur, level, &bp); if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); } void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, struct xfs_btree_block **block, struct xfs_buf **bpp); int xfs_btree_read_buf_block(struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, int flags, struct xfs_btree_block **block, struct xfs_buf **bpp); void xfs_btree_set_sibling(struct xfs_btree_cur *cur, struct xfs_btree_block *block, const union xfs_btree_ptr *ptr, int lr); void xfs_btree_init_block_cur(struct xfs_btree_cur *cur, struct xfs_buf *bp, int level, int numrecs); void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur, union xfs_btree_ptr *dst_ptr, const union xfs_btree_ptr *src_ptr, int numptrs); void xfs_btree_copy_keys(struct xfs_btree_cur *cur, union xfs_btree_key *dst_key, const union xfs_btree_key *src_key, int numkeys); void xfs_btree_init_ptr_from_cur(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); static inline struct xfs_btree_cur * xfs_btree_alloc_cursor( struct xfs_mount *mp, struct xfs_trans *tp, const struct xfs_btree_ops *ops, uint8_t maxlevels, struct kmem_cache *cache) { struct xfs_btree_cur *cur; ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN || ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN); /* BMBT allocations can come through from non-transactional context. */ cur = kmem_cache_zalloc(cache, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); cur->bc_ops = ops; cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_maxlevels = maxlevels; cur->bc_cache = cache; return cur; } int __init xfs_btree_init_cur_caches(void); void xfs_btree_destroy_cur_caches(void); int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur); /* Does this level of the cursor point to the inode root (and not a block)? */ static inline bool xfs_btree_at_iroot( const struct xfs_btree_cur *cur, int level) { return cur->bc_ops->type == XFS_BTREE_TYPE_INODE && level == cur->bc_nlevels - 1; } #endif /* __XFS_BTREE_H__ */ |
498 4 499 500 408 425 1742 410 1740 189 56 126 41 553 272 382 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 | // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <linux/atomic.h> #include <linux/inetdevice.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_nat_masquerade.h> struct masq_dev_work { struct work_struct work; struct net *net; netns_tracker ns_tracker; union nf_inet_addr addr; int ifindex; int (*iter)(struct nf_conn *i, void *data); }; #define MAX_MASQ_WORKER_COUNT 16 static DEFINE_MUTEX(masq_mutex); static unsigned int masq_refcnt __read_mostly; static atomic_t masq_worker_count __read_mostly; unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, const struct nf_nat_range2 *range, const struct net_device *out) { struct nf_conn *ct; struct nf_conn_nat *nat; enum ip_conntrack_info ctinfo; struct nf_nat_range2 newrange; const struct rtable *rt; __be32 newsrc, nh; WARN_ON(hooknum != NF_INET_POST_ROUTING); ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); /* Source address is 0.0.0.0 - locally generated packet that is * probably not supposed to be masqueraded. */ if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) return NF_ACCEPT; rt = skb_rtable(skb); nh = rt_nexthop(rt, ip_hdr(skb)->daddr); newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE); if (!newsrc) { pr_info("%s ate my IP address\n", out->name); return NF_DROP; } nat = nf_ct_nat_ext_add(ct); if (nat) nat->masq_index = out->ifindex; /* Transfer from original range. */ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.ip = newsrc; newrange.max_addr.ip = newsrc; newrange.min_proto = range->min_proto; newrange.max_proto = range->max_proto; /* Hand modified range to generic setup. */ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4); static void iterate_cleanup_work(struct work_struct *work) { struct nf_ct_iter_data iter_data = {}; struct masq_dev_work *w; w = container_of(work, struct masq_dev_work, work); iter_data.net = w->net; iter_data.data = (void *)w; nf_ct_iterate_cleanup_net(w->iter, &iter_data); put_net_track(w->net, &w->ns_tracker); kfree(w); atomic_dec(&masq_worker_count); module_put(THIS_MODULE); } /* Iterate conntrack table in the background and remove conntrack entries * that use the device/address being removed. * * In case too many work items have been queued already or memory allocation * fails iteration is skipped, conntrack entries will time out eventually. */ static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr, int ifindex, int (*iter)(struct nf_conn *i, void *data), gfp_t gfp_flags) { struct masq_dev_work *w; if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT) return; net = maybe_get_net(net); if (!net) return; if (!try_module_get(THIS_MODULE)) goto err_module; w = kzalloc(sizeof(*w), gfp_flags); if (w) { /* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */ atomic_inc(&masq_worker_count); INIT_WORK(&w->work, iterate_cleanup_work); w->ifindex = ifindex; w->net = net; netns_tracker_alloc(net, &w->ns_tracker, gfp_flags); w->iter = iter; if (addr) w->addr = *addr; schedule_work(&w->work); return; } module_put(THIS_MODULE); err_module: put_net(net); } static int device_cmp(struct nf_conn *i, void *arg) { const struct nf_conn_nat *nat = nfct_nat(i); const struct masq_dev_work *w = arg; if (!nat) return 0; return nat->masq_index == w->ifindex; } static int masq_device_event(struct notifier_block *this, unsigned long event, void *ptr) { const struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (event == NETDEV_DOWN) { /* Device was downed. Search entire table for * conntracks which were associated with that device, * and forget them. */ nf_nat_masq_schedule(net, NULL, dev->ifindex, device_cmp, GFP_KERNEL); } return NOTIFY_DONE; } static int inet_cmp(struct nf_conn *ct, void *ptr) { struct nf_conntrack_tuple *tuple; struct masq_dev_work *w = ptr; if (!device_cmp(ct, ptr)) return 0; tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3); } static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { const struct in_ifaddr *ifa = ptr; const struct in_device *idev; const struct net_device *dev; union nf_inet_addr addr; if (event != NETDEV_DOWN) return NOTIFY_DONE; /* The masq_dev_notifier will catch the case of the device going * down. So if the inetdev is dead and being destroyed we have * no work to do. Otherwise this is an individual address removal * and we have to perform the flush. */ idev = ifa->ifa_dev; if (idev->dead) return NOTIFY_DONE; memset(&addr, 0, sizeof(addr)); addr.ip = ifa->ifa_address; dev = idev->dev; nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex, inet_cmp, GFP_KERNEL); return NOTIFY_DONE; } static struct notifier_block masq_dev_notifier = { .notifier_call = masq_device_event, }; static struct notifier_block masq_inet_notifier = { .notifier_call = masq_inet_event, }; #if IS_ENABLED(CONFIG_IPV6) static int nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr) { #ifdef CONFIG_IPV6_MODULE const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); if (!v6_ops) return -EHOSTUNREACH; return v6_ops->dev_get_saddr(net, dev, daddr, srcprefs, saddr); #else return ipv6_dev_get_saddr(net, dev, daddr, srcprefs, saddr); #endif } unsigned int nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, const struct net_device *out) { enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; struct in6_addr src; struct nf_conn *ct; struct nf_nat_range2 newrange; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); if (nat_ipv6_dev_get_saddr(nf_ct_net(ct), out, &ipv6_hdr(skb)->daddr, 0, &src) < 0) return NF_DROP; nat = nf_ct_nat_ext_add(ct); if (nat) nat->masq_index = out->ifindex; newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.in6 = src; newrange.max_addr.in6 = src; newrange.min_proto = range->min_proto; newrange.max_proto = range->max_proto; return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6); /* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep). * * Defer it to the system workqueue. * * As we can have 'a lot' of inet_events (depending on amount of ipv6 * addresses being deleted), we also need to limit work item queue. */ static int masq_inet6_event(struct notifier_block *this, unsigned long event, void *ptr) { struct inet6_ifaddr *ifa = ptr; const struct net_device *dev; union nf_inet_addr addr; if (event != NETDEV_DOWN) return NOTIFY_DONE; dev = ifa->idev->dev; memset(&addr, 0, sizeof(addr)); addr.in6 = ifa->addr; nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp, GFP_ATOMIC); return NOTIFY_DONE; } static struct notifier_block masq_inet6_notifier = { .notifier_call = masq_inet6_event, }; static int nf_nat_masquerade_ipv6_register_notifier(void) { return register_inet6addr_notifier(&masq_inet6_notifier); } #else static inline int nf_nat_masquerade_ipv6_register_notifier(void) { return 0; } #endif int nf_nat_masquerade_inet_register_notifiers(void) { int ret = 0; mutex_lock(&masq_mutex); if (WARN_ON_ONCE(masq_refcnt == UINT_MAX)) { ret = -EOVERFLOW; goto out_unlock; } /* check if the notifier was already set */ if (++masq_refcnt > 1) goto out_unlock; /* Register for device down reports */ ret = register_netdevice_notifier(&masq_dev_notifier); if (ret) goto err_dec; /* Register IP address change reports */ ret = register_inetaddr_notifier(&masq_inet_notifier); if (ret) goto err_unregister; ret = nf_nat_masquerade_ipv6_register_notifier(); if (ret) goto err_unreg_inet; mutex_unlock(&masq_mutex); return ret; err_unreg_inet: unregister_inetaddr_notifier(&masq_inet_notifier); err_unregister: unregister_netdevice_notifier(&masq_dev_notifier); err_dec: masq_refcnt--; out_unlock: mutex_unlock(&masq_mutex); return ret; } EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_register_notifiers); void nf_nat_masquerade_inet_unregister_notifiers(void) { mutex_lock(&masq_mutex); /* check if the notifiers still have clients */ if (--masq_refcnt > 0) goto out_unlock; unregister_netdevice_notifier(&masq_dev_notifier); unregister_inetaddr_notifier(&masq_inet_notifier); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&masq_inet6_notifier); #endif out_unlock: mutex_unlock(&masq_mutex); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_unregister_notifiers); |
308 65 9 59 52 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Internal procfs definitions * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/proc_fs.h> #include <linux/proc_ns.h> #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/binfmts.h> #include <linux/sched/coredump.h> #include <linux/sched/task.h> #include <linux/mm.h> struct ctl_table_header; struct mempolicy; /* * This is not completely implemented yet. The idea is to * create an in-memory tree (like the actual /proc filesystem * tree) of these proc_dir_entries, so that we can dynamically * add new files to /proc. * * parent/subdir are used for the directory structure (every /proc file has a * parent, but "subdir" is empty for all non-directory entries). * subdir_node is used to build the rb tree "subdir" of the parent. */ struct proc_dir_entry { /* * number of callers into module in progress; * negative -> it's going away RSN */ atomic_t in_use; refcount_t refcnt; struct list_head pde_openers; /* who did ->open, but not ->release */ /* protects ->pde_openers and all struct pde_opener instances */ spinlock_t pde_unload_lock; struct completion *pde_unload_completion; const struct inode_operations *proc_iops; union { const struct proc_ops *proc_ops; const struct file_operations *proc_dir_ops; }; const struct dentry_operations *proc_dops; union { const struct seq_operations *seq_ops; int (*single_show)(struct seq_file *, void *); }; proc_write_t write; void *data; unsigned int state_size; unsigned int low_ino; nlink_t nlink; kuid_t uid; kgid_t gid; loff_t size; struct proc_dir_entry *parent; struct rb_root subdir; struct rb_node subdir_node; char *name; umode_t mode; u8 flags; u8 namelen; char inline_name[]; } __randomize_layout; #define SIZEOF_PDE ( \ sizeof(struct proc_dir_entry) < 128 ? 128 : \ sizeof(struct proc_dir_entry) < 192 ? 192 : \ sizeof(struct proc_dir_entry) < 256 ? 256 : \ sizeof(struct proc_dir_entry) < 512 ? 512 : \ 0) #define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry)) static inline bool pde_is_permanent(const struct proc_dir_entry *pde) { return pde->flags & PROC_ENTRY_PERMANENT; } static inline void pde_make_permanent(struct proc_dir_entry *pde) { pde->flags |= PROC_ENTRY_PERMANENT; } extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); union proc_op { int (*proc_get_link)(struct dentry *, struct path *); int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); int lsmid; }; struct proc_inode { struct pid *pid; unsigned int fd; union proc_op op; struct proc_dir_entry *pde; struct ctl_table_header *sysctl; const struct ctl_table *sysctl_entry; struct hlist_node sibling_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; } __randomize_layout; /* * General functions */ static inline struct proc_inode *PROC_I(const struct inode *inode) { return container_of(inode, struct proc_inode, vfs_inode); } static inline struct proc_dir_entry *PDE(const struct inode *inode) { return PROC_I(inode)->pde; } static inline struct pid *proc_pid(const struct inode *inode) { return PROC_I(inode)->pid; } static inline struct task_struct *get_proc_task(const struct inode *inode) { return get_pid_task(proc_pid(inode), PIDTYPE_PID); } void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid); unsigned name_to_int(const struct qstr *qstr); /* * Offset of the first process in the /proc root directory.. */ #define FIRST_PROCESS_ENTRY 256 /* Worst case buffer size needed for holding an integer. */ #define PROC_NUMBUF 13 /** * folio_precise_page_mapcount() - Number of mappings of this folio page. * @folio: The folio. * @page: The page. * * The number of present user page table entries that reference this page * as tracked via the RMAP: either referenced directly (PTE) or as part of * a larger area that covers this page (e.g., PMD). * * Use this function only for the calculation of existing statistics * (USS, PSS, mapcount_max) and for debugging purposes (/proc/kpagecount). * * Do not add new users. * * Returns: The number of mappings of this folio page. 0 for * folios that are not mapped to user space or are not tracked via the RMAP * (e.g., shared zeropage). */ static inline int folio_precise_page_mapcount(struct folio *folio, struct page *page) { int mapcount = atomic_read(&page->_mapcount) + 1; if (page_mapcount_is_type(mapcount)) mapcount = 0; if (folio_test_large(folio)) mapcount += folio_entire_mapcount(folio); return mapcount; } /* * array.c */ extern const struct file_operations proc_tid_children_operations; extern void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape); extern int proc_tid_stat(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_pid_status(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); /* * base.c */ extern const struct dentry_operations pid_dentry_operations; extern int pid_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int proc_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void proc_pid_evict_inode(struct proc_inode *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); extern int proc_pid_readdir(struct file *, struct dir_context *); struct dentry *proc_pid_lookup(struct dentry *, unsigned int); extern loff_t mem_lseek(struct file *, loff_t, int); /* Lookups */ typedef struct dentry *instantiate_t(struct dentry *, struct task_struct *, const void *); bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int, instantiate_t, struct task_struct *, const void *); /* * generic.c */ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, struct proc_dir_entry **parent, void *data); struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, struct proc_dir_entry *dp); extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *); extern int proc_readdir(struct file *, struct dir_context *); int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *); static inline void pde_get(struct proc_dir_entry *pde) { refcount_inc(&pde->refcnt); } extern void pde_put(struct proc_dir_entry *); static inline bool is_empty_pde(const struct proc_dir_entry *pde) { return S_ISDIR(pde->mode) && !pde->proc_iops; } extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, loff_t *); /* * inode.c */ struct pde_opener { struct list_head lh; struct file *file; bool closing; struct completion *c; } __randomize_layout; extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; extern const struct super_operations proc_sops; void proc_init_kmemcache(void); void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern void proc_entry_rundown(struct proc_dir_entry *); /* * proc_namespaces.c */ extern const struct inode_operations proc_ns_dir_inode_operations; extern const struct file_operations proc_ns_dir_operations; /* * proc_net.c */ extern const struct file_operations proc_net_operations; extern const struct inode_operations proc_net_inode_operations; #ifdef CONFIG_NET extern int proc_net_init(void); #else static inline int proc_net_init(void) { return 0; } #endif /* * proc_self.c */ extern int proc_setup_self(struct super_block *); /* * proc_thread_self.c */ extern int proc_setup_thread_self(struct super_block *); extern void proc_thread_self_init(void); /* * proc_sysctl.c */ #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); extern void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head); #else static inline void proc_sys_init(void) { } static inline void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) { } #endif /* * proc_tty.c */ #ifdef CONFIG_TTY extern void proc_tty_init(void); #else static inline void proc_tty_init(void) {} #endif /* * root.c */ extern struct proc_dir_entry proc_root; extern void proc_self_init(void); /* * task_[no]mmu.c */ struct mem_size_stats; struct proc_maps_private { struct inode *inode; struct task_struct *task; struct mm_struct *mm; struct vma_iterator iter; #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif } __randomize_layout; struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); extern const struct file_operations proc_pid_maps_operations; extern const struct file_operations proc_pid_numa_maps_operations; extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, unsigned long *, unsigned long *, unsigned long *, unsigned long *); extern void task_mem(struct seq_file *, struct mm_struct *); extern const struct dentry_operations proc_net_dentry_ops; static inline void pde_force_lookup(struct proc_dir_entry *pde) { /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ pde->proc_dops = &proc_net_dentry_ops; } /* * Add a new procfs dentry that can't serve as a mountpoint. That should * encompass anything that is ephemeral and can just disappear while the * process is still around. */ static inline struct dentry *proc_splice_unmountable(struct inode *inode, struct dentry *dentry, const struct dentry_operations *d_ops) { d_set_d_op(dentry, d_ops); dont_mount(dentry); return d_splice_alias(inode, dentry); } |
32 32 28 27 25 25 25 25 21 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 * Phillip Lougher <phillip@squashfs.org.uk> * * zlib_wrapper.c */ #include <linux/mutex.h> #include <linux/bio.h> #include <linux/slab.h> #include <linux/zlib.h> #include <linux/vmalloc.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" #include "page_actor.h" static void *zlib_init(struct squashfs_sb_info *dummy, void *buff) { z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL); if (stream == NULL) goto failed; stream->workspace = vmalloc(zlib_inflate_workspacesize()); if (stream->workspace == NULL) goto failed; return stream; failed: ERROR("Failed to allocate zlib workspace\n"); kfree(stream); return ERR_PTR(-ENOMEM); } static void zlib_free(void *strm) { z_stream *stream = strm; if (stream) vfree(stream->workspace); kfree(stream); } static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { struct bvec_iter_all iter_all = {}; struct bio_vec *bvec = bvec_init_iter_all(&iter_all); int zlib_init = 0, error = 0; z_stream *stream = strm; stream->avail_out = PAGE_SIZE; stream->next_out = squashfs_first_page(output); stream->avail_in = 0; if (IS_ERR(stream->next_out)) { error = PTR_ERR(stream->next_out); goto finish; } for (;;) { int zlib_err; if (stream->avail_in == 0) { const void *data; int avail; if (!bio_next_segment(bio, &iter_all)) { /* Z_STREAM_END must be reached. */ error = -EIO; break; } avail = min(length, ((int)bvec->bv_len) - offset); data = bvec_virt(bvec); length -= avail; stream->next_in = data + offset; stream->avail_in = avail; offset = 0; } if (stream->avail_out == 0) { stream->next_out = squashfs_next_page(output); if (IS_ERR(stream->next_out)) { error = PTR_ERR(stream->next_out); break; } else if (stream->next_out != NULL) stream->avail_out = PAGE_SIZE; } if (!zlib_init) { zlib_err = zlib_inflateInit(stream); if (zlib_err != Z_OK) { error = -EIO; break; } zlib_init = 1; } zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH); if (zlib_err == Z_STREAM_END) break; if (zlib_err != Z_OK) { error = -EIO; break; } } finish: squashfs_finish_page(output); if (!error) if (zlib_inflateEnd(stream) != Z_OK) error = -EIO; return error ? error : stream->total_out; } const struct squashfs_decompressor squashfs_zlib_comp_ops = { .init = zlib_init, .free = zlib_free, .decompress = zlib_uncompress, .id = ZLIB_COMPRESSION, .name = "zlib", .alloc_buffer = 1, .supported = 1 }; |
1 1 24 24 24 1 1 25 25 24 3 25 24 25 3 25 19 20 20 20 3 3 3 25 25 25 24 25 24 3 3 3 3 3 25 25 24 24 20 3 3 3 3 3 2 3 24 25 25 24 25 24 24 25 25 25 25 4 4 2 24 3 3 3 25 18 17 17 3 3 20 20 17 18 20 20 3 18 3 3 3 2 3 2 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 25 24 25 1 1 1 1 19 19 12 12 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_btree.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_alloc.h" #include "xfs_log.h" #include "xfs_btree_staging.h" #include "xfs_ag.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" #include "xfs_bmap_btree.h" #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "xfs_health.h" #include "xfs_buf_mem.h" #include "xfs_btree_mem.h" /* * Btree magic numbers. */ uint32_t xfs_btree_magic( struct xfs_mount *mp, const struct xfs_btree_ops *ops) { int idx = xfs_has_crc(mp) ? 1 : 0; __be32 magic = ops->buf_ops->magic[idx]; /* Ensure we asked for crc for crc-only magics. */ ASSERT(magic != 0); return be32_to_cpu(magic); } /* * These sibling pointer checks are optimised for null sibling pointers. This * happens a lot, and we don't need to byte swap at runtime if the sibling * pointer is NULL. * * These are explicitly marked at inline because the cost of calling them as * functions instead of inlining them is about 36 bytes extra code per call site * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these * two sibling check functions reduces the compiled code size by over 300 * bytes. */ static inline xfs_failaddr_t xfs_btree_check_fsblock_siblings( struct xfs_mount *mp, xfs_fsblock_t fsb, __be64 dsibling) { xfs_fsblock_t sibling; if (dsibling == cpu_to_be64(NULLFSBLOCK)) return NULL; sibling = be64_to_cpu(dsibling); if (sibling == fsb) return __this_address; if (!xfs_verify_fsbno(mp, sibling)) return __this_address; return NULL; } static inline xfs_failaddr_t xfs_btree_check_memblock_siblings( struct xfs_buftarg *btp, xfbno_t bno, __be64 dsibling) { xfbno_t sibling; if (dsibling == cpu_to_be64(NULLFSBLOCK)) return NULL; sibling = be64_to_cpu(dsibling); if (sibling == bno) return __this_address; if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling))) return __this_address; return NULL; } static inline xfs_failaddr_t xfs_btree_check_agblock_siblings( struct xfs_perag *pag, xfs_agblock_t agbno, __be32 dsibling) { xfs_agblock_t sibling; if (dsibling == cpu_to_be32(NULLAGBLOCK)) return NULL; sibling = be32_to_cpu(dsibling); if (sibling == agbno) return __this_address; if (!xfs_verify_agbno(pag, sibling)) return __this_address; return NULL; } static xfs_failaddr_t __xfs_btree_check_lblock_hdr( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; if (xfs_has_crc(mp)) { if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (block->bb_u.l.bb_blkno != cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL)) return __this_address; if (block->bb_u.l.bb_pad != cpu_to_be32(0)) return __this_address; } if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops)) return __this_address; if (be16_to_cpu(block->bb_level) != level) return __this_address; if (be16_to_cpu(block->bb_numrecs) > cur->bc_ops->get_maxrecs(cur, level)) return __this_address; return NULL; } /* * Check a long btree block header. Return the address of the failing check, * or NULL if everything is ok. */ static xfs_failaddr_t __xfs_btree_check_fsblock( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; xfs_failaddr_t fa; xfs_fsblock_t fsb; fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp); if (fa) return fa; /* * For inode-rooted btrees, the root block sits in the inode fork. In * that case bp is NULL, and the block must not have any siblings. */ if (!bp) { if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK)) return __this_address; if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK)) return __this_address; return NULL; } fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); fa = xfs_btree_check_fsblock_siblings(mp, fsb, block->bb_u.l.bb_leftsib); if (!fa) fa = xfs_btree_check_fsblock_siblings(mp, fsb, block->bb_u.l.bb_rightsib); return fa; } /* * Check an in-memory btree block header. Return the address of the failing * check, or NULL if everything is ok. */ static xfs_failaddr_t __xfs_btree_check_memblock( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target; xfs_failaddr_t fa; xfbno_t bno; fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp); if (fa) return fa; bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp)); fa = xfs_btree_check_memblock_siblings(btp, bno, block->bb_u.l.bb_leftsib); if (!fa) fa = xfs_btree_check_memblock_siblings(btp, bno, block->bb_u.l.bb_rightsib); return fa; } /* * Check a short btree block header. Return the address of the failing check, * or NULL if everything is ok. */ static xfs_failaddr_t __xfs_btree_check_agblock( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; struct xfs_perag *pag = to_perag(cur->bc_group); xfs_failaddr_t fa; xfs_agblock_t agbno; if (xfs_has_crc(mp)) { if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) return __this_address; } if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops)) return __this_address; if (be16_to_cpu(block->bb_level) != level) return __this_address; if (be16_to_cpu(block->bb_numrecs) > cur->bc_ops->get_maxrecs(cur, level)) return __this_address; agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); fa = xfs_btree_check_agblock_siblings(pag, agbno, block->bb_u.s.bb_leftsib); if (!fa) fa = xfs_btree_check_agblock_siblings(pag, agbno, block->bb_u.s.bb_rightsib); return fa; } /* * Internal btree block check. * * Return NULL if the block is ok or the address of the failed check otherwise. */ xfs_failaddr_t __xfs_btree_check_block( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_MEM: return __xfs_btree_check_memblock(cur, block, level, bp); case XFS_BTREE_TYPE_AG: return __xfs_btree_check_agblock(cur, block, level, bp); case XFS_BTREE_TYPE_INODE: return __xfs_btree_check_fsblock(cur, block, level, bp); default: ASSERT(0); return __this_address; } } static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur) { if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN) return XFS_ERRTAG_BTREE_CHECK_SBLOCK; return XFS_ERRTAG_BTREE_CHECK_LBLOCK; } /* * Debug routine: check that block header is ok. */ int xfs_btree_check_block( struct xfs_btree_cur *cur, /* btree cursor */ struct xfs_btree_block *block, /* generic btree block pointer */ int level, /* level of the btree block */ struct xfs_buf *bp) /* buffer containing block, if any */ { struct xfs_mount *mp = cur->bc_mp; xfs_failaddr_t fa; fa = __xfs_btree_check_block(cur, block, level, bp); if (XFS_IS_CORRUPT(mp, fa != NULL) || XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } return 0; } int __xfs_btree_check_ptr( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, int index, int level) { if (level <= 0) return -EFSCORRUPTED; switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_MEM: if (!xfbtree_verify_bno(cur->bc_mem.xfbtree, be64_to_cpu((&ptr->l)[index]))) return -EFSCORRUPTED; break; case XFS_BTREE_TYPE_INODE: if (!xfs_verify_fsbno(cur->bc_mp, be64_to_cpu((&ptr->l)[index]))) return -EFSCORRUPTED; break; case XFS_BTREE_TYPE_AG: if (!xfs_verify_agbno(to_perag(cur->bc_group), be32_to_cpu((&ptr->s)[index]))) return -EFSCORRUPTED; break; } return 0; } /* * Check that a given (indexed) btree pointer at a certain level of a * btree is valid and doesn't point past where it should. */ static int xfs_btree_check_ptr( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, int index, int level) { int error; error = __xfs_btree_check_ptr(cur, ptr, index, level); if (error) { switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_MEM: xfs_err(cur->bc_mp, "In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.", cur->bc_ops->name, cur->bc_flags, level, index, __this_address); break; case XFS_BTREE_TYPE_INODE: xfs_err(cur->bc_mp, "Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.", cur->bc_ino.ip->i_ino, cur->bc_ino.whichfork, cur->bc_ops->name, level, index); break; case XFS_BTREE_TYPE_AG: xfs_err(cur->bc_mp, "AG %u: Corrupt %sbt pointer at level %d index %d.", cur->bc_group->xg_gno, cur->bc_ops->name, level, index); break; } xfs_btree_mark_sick(cur); } return error; } #ifdef DEBUG # define xfs_btree_debug_check_ptr xfs_btree_check_ptr #else # define xfs_btree_debug_check_ptr(...) (0) #endif /* * Calculate CRC on the whole btree block and stuff it into the * long-form btree header. * * Prior to calculting the CRC, pull the LSN out of the buffer log item and put * it into the buffer so recovery knows what the last modification was that made * it to disk. */ void xfs_btree_fsblock_calc_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_buf_log_item *bip = bp->b_log_item; if (!xfs_has_crc(bp->b_mount)) return; if (bip) block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); } bool xfs_btree_fsblock_verify_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_mount *mp = bp->b_mount; if (xfs_has_crc(mp)) { if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn))) return false; return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); } return true; } /* * Calculate CRC on the whole btree block and stuff it into the * short-form btree header. * * Prior to calculting the CRC, pull the LSN out of the buffer log item and put * it into the buffer so recovery knows what the last modification was that made * it to disk. */ void xfs_btree_agblock_calc_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_buf_log_item *bip = bp->b_log_item; if (!xfs_has_crc(bp->b_mount)) return; if (bip) block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); } bool xfs_btree_agblock_verify_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_mount *mp = bp->b_mount; if (xfs_has_crc(mp)) { if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) return false; return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); } return true; } static int xfs_btree_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { int error; trace_xfs_btree_free_block(cur, bp); /* * Don't allow block freeing for a staging cursor, because staging * cursors do not support regular btree modifications. */ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { ASSERT(0); return -EFSCORRUPTED; } error = cur->bc_ops->free_block(cur, bp); if (!error) { xfs_trans_binval(cur->bc_tp, bp); XFS_BTREE_STATS_INC(cur, free); } return error; } /* * Delete the btree cursor. */ void xfs_btree_del_cursor( struct xfs_btree_cur *cur, /* btree cursor */ int error) /* del because of error */ { int i; /* btree level */ /* * Clear the buffer pointers and release the buffers. If we're doing * this because of an error, inspect all of the entries in the bc_bufs * array for buffers to be unlocked. This is because some of the btree * code works from level n down to 0, and if we get an error along the * way we won't have initialized all the entries down to 0. */ for (i = 0; i < cur->bc_nlevels; i++) { if (cur->bc_levels[i].bp) xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp); else if (!error) break; } /* * If we are doing a BMBT update, the number of unaccounted blocks * allocated during this cursor life time should be zero. If it's not * zero, then we should be shut down or on our way to shutdown due to * cancelling a dirty transaction on error. */ ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 || xfs_is_shutdown(cur->bc_mp) || error != 0); if (cur->bc_group) xfs_group_put(cur->bc_group); kmem_cache_free(cur->bc_cache, cur); } /* Return the buffer target for this btree's buffer. */ static inline struct xfs_buftarg * xfs_btree_buftarg( struct xfs_btree_cur *cur) { if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM) return cur->bc_mem.xfbtree->target; return cur->bc_mp->m_ddev_targp; } /* Return the block size (in units of 512b sectors) for this btree. */ static inline unsigned int xfs_btree_bbsize( struct xfs_btree_cur *cur) { if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM) return XFBNO_BBSIZE; return cur->bc_mp->m_bsize; } /* * Duplicate the btree cursor. * Allocate a new one, copy the record, re-get the buffers. */ int /* error */ xfs_btree_dup_cursor( struct xfs_btree_cur *cur, /* input cursor */ struct xfs_btree_cur **ncur) /* output cursor */ { struct xfs_mount *mp = cur->bc_mp; struct xfs_trans *tp = cur->bc_tp; struct xfs_buf *bp; struct xfs_btree_cur *new; int error; int i; /* * Don't allow staging cursors to be duplicated because they're supposed * to be kept private to a single thread. */ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { ASSERT(0); return -EFSCORRUPTED; } /* * Allocate a new cursor like the old one. */ new = cur->bc_ops->dup_cursor(cur); /* * Copy the record currently in the cursor. */ new->bc_rec = cur->bc_rec; /* * For each level current, re-get the buffer and copy the ptr value. */ for (i = 0; i < new->bc_nlevels; i++) { new->bc_levels[i].ptr = cur->bc_levels[i].ptr; new->bc_levels[i].ra = cur->bc_levels[i].ra; bp = cur->bc_levels[i].bp; if (bp) { error = xfs_trans_read_buf(mp, tp, xfs_btree_buftarg(cur), xfs_buf_daddr(bp), xfs_btree_bbsize(cur), 0, &bp, cur->bc_ops->buf_ops); if (xfs_metadata_is_sick(error)) xfs_btree_mark_sick(new); if (error) { xfs_btree_del_cursor(new, error); *ncur = NULL; return error; } } new->bc_levels[i].bp = bp; } *ncur = new; return 0; } /* * XFS btree block layout and addressing: * * There are two types of blocks in the btree: leaf and non-leaf blocks. * * The leaf record start with a header then followed by records containing * the values. A non-leaf block also starts with the same header, and * then first contains lookup keys followed by an equal number of pointers * to the btree blocks at the previous level. * * +--------+-------+-------+-------+-------+-------+-------+ * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N | * +--------+-------+-------+-------+-------+-------+-------+ * * +--------+-------+-------+-------+-------+-------+-------+ * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N | * +--------+-------+-------+-------+-------+-------+-------+ * * The header is called struct xfs_btree_block for reasons better left unknown * and comes in different versions for short (32bit) and long (64bit) block * pointers. The record and key structures are defined by the btree instances * and opaque to the btree core. The block pointers are simple disk endian * integers, available in a short (32bit) and long (64bit) variant. * * The helpers below calculate the offset of a given record, key or pointer * into a btree block (xfs_btree_*_offset) or return a pointer to the given * record, key or pointer (xfs_btree_*_addr). Note that all addressing * inside the btree block is done using indices starting at one, not zero! * * If XFS_BTGEO_OVERLAPPING is set, then this btree supports keys containing * overlapping intervals. In such a tree, records are still sorted lowest to * highest and indexed by the smallest key value that refers to the record. * However, nodes are different: each pointer has two associated keys -- one * indexing the lowest key available in the block(s) below (the same behavior * as the key in a regular btree) and another indexing the highest key * available in the block(s) below. Because records are /not/ sorted by the * highest key, all leaf block updates require us to compute the highest key * that matches any record in the leaf and to recursively update the high keys * in the nodes going further up in the tree, if necessary. Nodes look like * this: * * +--------+-----+-----+-----+-----+-----+-------+-------+-----+ * Non-Leaf: | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... | * +--------+-----+-----+-----+-----+-----+-------+-------+-----+ * * To perform an interval query on an overlapped tree, perform the usual * depth-first search and use the low and high keys to decide if we can skip * that particular node. If a leaf node is reached, return the records that * intersect the interval. Note that an interval query may return numerous * entries. For a non-overlapped tree, simply search for the record associated * with the lowest key and iterate forward until a non-matching record is * found. Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in * more detail. * * Why do we care about overlapping intervals? Let's say you have a bunch of * reverse mapping records on a reflink filesystem: * * 1: +- file A startblock B offset C length D -----------+ * 2: +- file E startblock F offset G length H --------------+ * 3: +- file I startblock F offset J length K --+ * 4: +- file L... --+ * * Now say we want to map block (B+D) into file A at offset (C+D). Ideally, * we'd simply increment the length of record 1. But how do we find the record * that ends at (B+D-1) (i.e. record 1)? A LE lookup of (B+D-1) would return * record 3 because the keys are ordered first by startblock. An interval * query would return records 1 and 2 because they both overlap (B+D-1), and * from that we can pick out record 1 as the appropriate left neighbor. * * In the non-overlapped case you can do a LE lookup and decrement the cursor * because a record's interval must end before the next record. */ /* * Return size of the btree block header for this btree instance. */ static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) { if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (xfs_has_crc(cur->bc_mp)) return XFS_BTREE_LBLOCK_CRC_LEN; return XFS_BTREE_LBLOCK_LEN; } if (xfs_has_crc(cur->bc_mp)) return XFS_BTREE_SBLOCK_CRC_LEN; return XFS_BTREE_SBLOCK_LEN; } /* * Calculate offset of the n-th record in a btree block. */ STATIC size_t xfs_btree_rec_offset( struct xfs_btree_cur *cur, int n) { return xfs_btree_block_len(cur) + (n - 1) * cur->bc_ops->rec_len; } /* * Calculate offset of the n-th key in a btree block. */ STATIC size_t xfs_btree_key_offset( struct xfs_btree_cur *cur, int n) { return xfs_btree_block_len(cur) + (n - 1) * cur->bc_ops->key_len; } /* * Calculate offset of the n-th high key in a btree block. */ STATIC size_t xfs_btree_high_key_offset( struct xfs_btree_cur *cur, int n) { return xfs_btree_block_len(cur) + (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2); } /* * Calculate offset of the n-th block pointer in a btree block. */ STATIC size_t xfs_btree_ptr_offset( struct xfs_btree_cur *cur, int n, int level) { return xfs_btree_block_len(cur) + cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len + (n - 1) * cur->bc_ops->ptr_len; } /* * Return a pointer to the n-th record in the btree block. */ union xfs_btree_rec * xfs_btree_rec_addr( struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block) { return (union xfs_btree_rec *) ((char *)block + xfs_btree_rec_offset(cur, n)); } /* * Return a pointer to the n-th key in the btree block. */ union xfs_btree_key * xfs_btree_key_addr( struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block) { return (union xfs_btree_key *) ((char *)block + xfs_btree_key_offset(cur, n)); } /* * Return a pointer to the n-th high key in the btree block. */ union xfs_btree_key * xfs_btree_high_key_addr( struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block) { return (union xfs_btree_key *) ((char *)block + xfs_btree_high_key_offset(cur, n)); } /* * Return a pointer to the n-th block pointer in the btree block. */ union xfs_btree_ptr * xfs_btree_ptr_addr( struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block) { int level = xfs_btree_get_level(block); ASSERT(block->bb_level != 0); return (union xfs_btree_ptr *) ((char *)block + xfs_btree_ptr_offset(cur, n, level)); } struct xfs_ifork * xfs_btree_ifork_ptr( struct xfs_btree_cur *cur) { ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); if (cur->bc_flags & XFS_BTREE_STAGING) return cur->bc_ino.ifake->if_fork; return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork); } /* * Get the root block which is stored in the inode. * * For now this btree implementation assumes the btree root is always * stored in the if_broot field of an inode fork. */ STATIC struct xfs_btree_block * xfs_btree_get_iroot( struct xfs_btree_cur *cur) { struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); return (struct xfs_btree_block *)ifp->if_broot; } /* * Retrieve the block pointer from the cursor at the given level. * This may be an inode btree root or from a buffer. */ struct xfs_btree_block * /* generic btree block pointer */ xfs_btree_get_block( struct xfs_btree_cur *cur, /* btree cursor */ int level, /* level in btree */ struct xfs_buf **bpp) /* buffer containing the block */ { if (xfs_btree_at_iroot(cur, level)) { *bpp = NULL; return xfs_btree_get_iroot(cur); } *bpp = cur->bc_levels[level].bp; return XFS_BUF_TO_BLOCK(*bpp); } /* * Change the cursor to point to the first record at the given level. * Other levels are unaffected. */ STATIC int /* success=1, failure=0 */ xfs_btree_firstrec( struct xfs_btree_cur *cur, /* btree cursor */ int level) /* level to change */ { struct xfs_btree_block *block; /* generic btree block pointer */ struct xfs_buf *bp; /* buffer containing block */ /* * Get the block pointer for this level. */ block = xfs_btree_get_block(cur, level, &bp); if (xfs_btree_check_block(cur, block, level, bp)) return 0; /* * It's empty, there is no such record. */ if (!block->bb_numrecs) return 0; /* * Set the ptr value to 1, that's the first record/key. */ cur->bc_levels[level].ptr = 1; return 1; } /* * Change the cursor to point to the last record in the current block * at the given level. Other levels are unaffected. */ STATIC int /* success=1, failure=0 */ xfs_btree_lastrec( struct xfs_btree_cur *cur, /* btree cursor */ int level) /* level to change */ { struct xfs_btree_block *block; /* generic btree block pointer */ struct xfs_buf *bp; /* buffer containing block */ /* * Get the block pointer for this level. */ block = xfs_btree_get_block(cur, level, &bp); if (xfs_btree_check_block(cur, block, level, bp)) return 0; /* * It's empty, there is no such record. */ if (!block->bb_numrecs) return 0; /* * Set the ptr value to numrecs, that's the last record/key. */ cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs); return 1; } /* * Compute first and last byte offsets for the fields given. * Interprets the offsets table, which contains struct field offsets. */ void xfs_btree_offsets( uint32_t fields, /* bitmask of fields */ const short *offsets, /* table of field offsets */ int nbits, /* number of bits to inspect */ int *first, /* output: first byte offset */ int *last) /* output: last byte offset */ { int i; /* current bit number */ uint32_t imask; /* mask for current bit number */ ASSERT(fields != 0); /* * Find the lowest bit, so the first byte offset. */ for (i = 0, imask = 1u; ; i++, imask <<= 1) { if (imask & fields) { *first = offsets[i]; break; } } /* * Find the highest bit, so the last byte offset. */ for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) { if (imask & fields) { *last = offsets[i + 1] - 1; break; } } } STATIC int xfs_btree_readahead_fsblock( struct xfs_btree_cur *cur, int lr, struct xfs_btree_block *block) { struct xfs_mount *mp = cur->bc_mp; xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); int rval = 0; if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, left), mp->m_bsize, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, right), mp->m_bsize, cur->bc_ops->buf_ops); rval++; } return rval; } STATIC int xfs_btree_readahead_memblock( struct xfs_btree_cur *cur, int lr, struct xfs_btree_block *block) { struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target; xfbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); xfbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); int rval = 0; if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE, cur->bc_ops->buf_ops); rval++; } return rval; } STATIC int xfs_btree_readahead_agblock( struct xfs_btree_cur *cur, int lr, struct xfs_btree_block *block) { struct xfs_mount *mp = cur->bc_mp; struct xfs_perag *pag = to_perag(cur->bc_group); xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib); int rval = 0; if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { xfs_buf_readahead(mp->m_ddev_targp, xfs_agbno_to_daddr(pag, left), mp->m_bsize, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { xfs_buf_readahead(mp->m_ddev_targp, xfs_agbno_to_daddr(pag, right), mp->m_bsize, cur->bc_ops->buf_ops); rval++; } return rval; } /* * Read-ahead btree blocks, at the given level. * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. */ STATIC int xfs_btree_readahead( struct xfs_btree_cur *cur, /* btree cursor */ int lev, /* level in btree */ int lr) /* left/right bits */ { struct xfs_btree_block *block; /* * No readahead needed if we are at the root level and the * btree root is stored in the inode. */ if (xfs_btree_at_iroot(cur, lev)) return 0; if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra) return 0; cur->bc_levels[lev].ra |= lr; block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp); switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_AG: return xfs_btree_readahead_agblock(cur, lr, block); case XFS_BTREE_TYPE_INODE: return xfs_btree_readahead_fsblock(cur, lr, block); case XFS_BTREE_TYPE_MEM: return xfs_btree_readahead_memblock(cur, lr, block); default: ASSERT(0); return 0; } } STATIC int xfs_btree_ptr_to_daddr( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, xfs_daddr_t *daddr) { int error; error = xfs_btree_check_ptr(cur, ptr, 0, 1); if (error) return error; switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_AG: *daddr = xfs_agbno_to_daddr(to_perag(cur->bc_group), be32_to_cpu(ptr->s)); break; case XFS_BTREE_TYPE_INODE: *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); break; case XFS_BTREE_TYPE_MEM: *daddr = xfbno_to_daddr(be64_to_cpu(ptr->l)); break; } return 0; } /* * Readahead @count btree blocks at the given @ptr location. * * We don't need to care about long or short form btrees here as we have a * method of converting the ptr directly to a daddr available to us. */ STATIC void xfs_btree_readahead_ptr( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, xfs_extlen_t count) { xfs_daddr_t daddr; if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr)) return; xfs_buf_readahead(xfs_btree_buftarg(cur), daddr, xfs_btree_bbsize(cur) * count, cur->bc_ops->buf_ops); } /* * Set the buffer for level "lev" in the cursor to bp, releasing * any previous buffer. */ STATIC void xfs_btree_setbuf( struct xfs_btree_cur *cur, /* btree cursor */ int lev, /* level in btree */ struct xfs_buf *bp) /* new buffer to set */ { struct xfs_btree_block *b; /* btree block */ if (cur->bc_levels[lev].bp) xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp); cur->bc_levels[lev].bp = bp; cur->bc_levels[lev].ra = 0; b = XFS_BUF_TO_BLOCK(bp); if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)) cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)) cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA; } else { if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK)) cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK)) cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA; } } bool xfs_btree_ptr_is_null( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr) { if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return ptr->l == cpu_to_be64(NULLFSBLOCK); else return ptr->s == cpu_to_be32(NULLAGBLOCK); } void xfs_btree_set_ptr_null( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ptr->l = cpu_to_be64(NULLFSBLOCK); else ptr->s = cpu_to_be32(NULLAGBLOCK); } static inline bool xfs_btree_ptrs_equal( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr1, union xfs_btree_ptr *ptr2) { if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return ptr1->l == ptr2->l; return ptr1->s == ptr2->s; } /* * Get/set/init sibling pointers */ void xfs_btree_get_sibling( struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_ptr *ptr, int lr) { ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (lr == XFS_BB_RIGHTSIB) ptr->l = block->bb_u.l.bb_rightsib; else ptr->l = block->bb_u.l.bb_leftsib; } else { if (lr == XFS_BB_RIGHTSIB) ptr->s = block->bb_u.s.bb_rightsib; else ptr->s = block->bb_u.s.bb_leftsib; } } void xfs_btree_set_sibling( struct xfs_btree_cur *cur, struct xfs_btree_block *block, const union xfs_btree_ptr *ptr, int lr) { ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (lr == XFS_BB_RIGHTSIB) block->bb_u.l.bb_rightsib = ptr->l; else block->bb_u.l.bb_leftsib = ptr->l; } else { if (lr == XFS_BB_RIGHTSIB) block->bb_u.s.bb_rightsib = ptr->s; else block->bb_u.s.bb_leftsib = ptr->s; } } static void __xfs_btree_init_block( struct xfs_mount *mp, struct xfs_btree_block *buf, const struct xfs_btree_ops *ops, xfs_daddr_t blkno, __u16 level, __u16 numrecs, __u64 owner) { bool crc = xfs_has_crc(mp); __u32 magic = xfs_btree_magic(mp, ops); buf->bb_magic = cpu_to_be32(magic); buf->bb_level = cpu_to_be16(level); buf->bb_numrecs = cpu_to_be16(numrecs); if (ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); if (crc) { buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); buf->bb_u.l.bb_owner = cpu_to_be64(owner); uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid); buf->bb_u.l.bb_pad = 0; buf->bb_u.l.bb_lsn = 0; } } else { buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); if (crc) { buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); /* owner is a 32 bit value on short blocks */ buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner); uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid); buf->bb_u.s.bb_lsn = 0; } } } void xfs_btree_init_block( struct xfs_mount *mp, struct xfs_btree_block *block, const struct xfs_btree_ops *ops, __u16 level, __u16 numrecs, __u64 owner) { __xfs_btree_init_block(mp, block, ops, XFS_BUF_DADDR_NULL, level, numrecs, owner); } void xfs_btree_init_buf( struct xfs_mount *mp, struct xfs_buf *bp, const struct xfs_btree_ops *ops, __u16 level, __u16 numrecs, __u64 owner) { __xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops, xfs_buf_daddr(bp), level, numrecs, owner); bp->b_ops = ops->buf_ops; } static inline __u64 xfs_btree_owner( struct xfs_btree_cur *cur) { switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_MEM: return cur->bc_mem.xfbtree->owner; case XFS_BTREE_TYPE_INODE: return cur->bc_ino.ip->i_ino; case XFS_BTREE_TYPE_AG: return cur->bc_group->xg_gno; default: ASSERT(0); return 0; } } void xfs_btree_init_block_cur( struct xfs_btree_cur *cur, struct xfs_buf *bp, int level, int numrecs) { xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs, xfs_btree_owner(cur)); } STATIC void xfs_btree_buf_to_ptr( struct xfs_btree_cur *cur, struct xfs_buf *bp, union xfs_btree_ptr *ptr) { switch (cur->bc_ops->type) { case XFS_BTREE_TYPE_AG: ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp))); break; case XFS_BTREE_TYPE_INODE: ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp))); break; case XFS_BTREE_TYPE_MEM: ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp))); break; } } static inline void xfs_btree_set_refs( struct xfs_btree_cur *cur, struct xfs_buf *bp) { xfs_buf_set_ref(bp, cur->bc_ops->lru_refs); } int xfs_btree_get_buf_block( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, struct xfs_btree_block **block, struct xfs_buf **bpp) { xfs_daddr_t d; int error; error = xfs_btree_ptr_to_daddr(cur, ptr, &d); if (error) return error; error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d, xfs_btree_bbsize(cur), 0, bpp); if (error) return error; (*bpp)->b_ops = cur->bc_ops->buf_ops; *block = XFS_BUF_TO_BLOCK(*bpp); return 0; } /* * Read in the buffer at the given ptr and return the buffer and * the block pointer within the buffer. */ int xfs_btree_read_buf_block( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, int flags, struct xfs_btree_block **block, struct xfs_buf **bpp) { struct xfs_mount *mp = cur->bc_mp; xfs_daddr_t d; int error; /* need to sort out how callers deal with failures first */ ASSERT(!(flags & XBF_TRYLOCK)); error = xfs_btree_ptr_to_daddr(cur, ptr, &d); if (error) return error; error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d, xfs_btree_bbsize(cur), flags, bpp, cur->bc_ops->buf_ops); if (xfs_metadata_is_sick(error)) xfs_btree_mark_sick(cur); if (error) return error; xfs_btree_set_refs(cur, *bpp); *block = XFS_BUF_TO_BLOCK(*bpp); return 0; } /* * Copy keys from one btree block to another. */ void xfs_btree_copy_keys( struct xfs_btree_cur *cur, union xfs_btree_key *dst_key, const union xfs_btree_key *src_key, int numkeys) { ASSERT(numkeys >= 0); memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len); } /* * Copy records from one btree block to another. */ STATIC void xfs_btree_copy_recs( struct xfs_btree_cur *cur, union xfs_btree_rec *dst_rec, union xfs_btree_rec *src_rec, int numrecs) { ASSERT(numrecs >= 0); memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len); } /* * Copy block pointers from one btree block to another. */ void xfs_btree_copy_ptrs( struct xfs_btree_cur *cur, union xfs_btree_ptr *dst_ptr, const union xfs_btree_ptr *src_ptr, int numptrs) { ASSERT(numptrs >= 0); memcpy(dst_ptr, src_ptr, numptrs * cur->bc_ops->ptr_len); } /* * Shift keys one index left/right inside a single btree block. */ STATIC void xfs_btree_shift_keys( struct xfs_btree_cur *cur, union xfs_btree_key *key, int dir, int numkeys) { char *dst_key; ASSERT(numkeys >= 0); ASSERT(dir == 1 || dir == -1); dst_key = (char *)key + (dir * cur->bc_ops->key_len); memmove(dst_key, key, numkeys * cur->bc_ops->key_len); } /* * Shift records one index left/right inside a single btree block. */ STATIC void xfs_btree_shift_recs( struct xfs_btree_cur *cur, union xfs_btree_rec *rec, int dir, int numrecs) { char *dst_rec; ASSERT(numrecs >= 0); ASSERT(dir == 1 || dir == -1); dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len); memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len); } /* * Shift block pointers one index left/right inside a single btree block. */ STATIC void xfs_btree_shift_ptrs( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, int dir, int numptrs) { char *dst_ptr; ASSERT(numptrs >= 0); ASSERT(dir == 1 || dir == -1); dst_ptr = (char *)ptr + (dir * cur->bc_ops->ptr_len); memmove(dst_ptr, ptr, numptrs * cur->bc_ops->ptr_len); } /* * Log key values from the btree block. */ STATIC void xfs_btree_log_keys( struct xfs_btree_cur *cur, struct xfs_buf *bp, int first, int last) { if (bp) { xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_key_offset(cur, first), xfs_btree_key_offset(cur, last + 1) - 1); } else { xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } /* * Log record values from the btree block. */ void xfs_btree_log_recs( struct xfs_btree_cur *cur, struct xfs_buf *bp, int first, int last) { xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_rec_offset(cur, first), xfs_btree_rec_offset(cur, last + 1) - 1); } /* * Log block pointer fields from a btree block (nonleaf). */ STATIC void xfs_btree_log_ptrs( struct xfs_btree_cur *cur, /* btree cursor */ struct xfs_buf *bp, /* buffer containing btree block */ int first, /* index of first pointer to log */ int last) /* index of last pointer to log */ { if (bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); int level = xfs_btree_get_level(block); xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_ptr_offset(cur, first, level), xfs_btree_ptr_offset(cur, last + 1, level) - 1); } else { xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } /* * Log fields from a btree block header. */ void xfs_btree_log_block( struct xfs_btree_cur *cur, /* btree cursor */ struct xfs_buf *bp, /* buffer containing btree block */ uint32_t fields) /* mask of fields: XFS_BB_... */ { int first; /* first byte offset logged */ int last; /* last byte offset logged */ static const short soffsets[] = { /* table of offsets (short) */ offsetof(struct xfs_btree_block, bb_magic), offsetof(struct xfs_btree_block, bb_level), offsetof(struct xfs_btree_block, bb_numrecs), offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib), offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib), offsetof(struct xfs_btree_block, bb_u.s.bb_blkno), offsetof(struct xfs_btree_block, bb_u.s.bb_lsn), offsetof(struct xfs_btree_block, bb_u.s.bb_uuid), offsetof(struct xfs_btree_block, bb_u.s.bb_owner), offsetof(struct xfs_btree_block, bb_u.s.bb_crc), XFS_BTREE_SBLOCK_CRC_LEN }; static const short loffsets[] = { /* table of offsets (long) */ offsetof(struct xfs_btree_block, bb_magic), offsetof(struct xfs_btree_block, bb_level), offsetof(struct xfs_btree_block, bb_numrecs), offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib), offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib), offsetof(struct xfs_btree_block, bb_u.l.bb_blkno), offsetof(struct xfs_btree_block, bb_u.l.bb_lsn), offsetof(struct xfs_btree_block, bb_u.l.bb_uuid), offsetof(struct xfs_btree_block, bb_u.l.bb_owner), offsetof(struct xfs_btree_block, bb_u.l.bb_crc), offsetof(struct xfs_btree_block, bb_u.l.bb_pad), XFS_BTREE_LBLOCK_CRC_LEN }; if (bp) { int nbits; if (xfs_has_crc(cur->bc_mp)) { /* * We don't log the CRC when updating a btree * block but instead recreate it during log * recovery. As the log buffers have checksums * of their own this is safe and avoids logging a crc * update in a lot of places. */ if (fields == XFS_BB_ALL_BITS) fields = XFS_BB_ALL_BITS_CRC; nbits = XFS_BB_NUM_BITS_CRC; } else { nbits = XFS_BB_NUM_BITS; } xfs_btree_offsets(fields, (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ? loffsets : soffsets, nbits, &first, &last); xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, first, last); } else { xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } /* * Increment cursor by one record at the level. * For nonzero levels the leaf-ward information is untouched. */ int /* error */ xfs_btree_increment( struct xfs_btree_cur *cur, int level, int *stat) /* success/failure */ { struct xfs_btree_block *block; union xfs_btree_ptr ptr; struct xfs_buf *bp; int error; /* error return value */ int lev; ASSERT(level < cur->bc_nlevels); /* Read-ahead to the right at this level. */ xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); /* Get a pointer to the btree block. */ block = xfs_btree_get_block(cur, level, &bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) goto error0; #endif /* We're done if we remain in the block after the increment. */ if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block)) goto out1; /* Fail if we just went off the right edge of the tree. */ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); if (xfs_btree_ptr_is_null(cur, &ptr)) goto out0; XFS_BTREE_STATS_INC(cur, increment); /* * March up the tree incrementing pointers. * Stop when we don't go off the right edge of a block. */ for (lev = level + 1; lev < cur->bc_nlevels; lev++) { block = xfs_btree_get_block(cur, lev, &bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, lev, bp); if (error) goto error0; #endif if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block)) break; /* Read-ahead the right block for the next loop. */ xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); } /* * If we went off the root then we are either seriously * confused or have the tree root in an inode. */ if (lev == cur->bc_nlevels) { if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) goto out0; ASSERT(0); xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } ASSERT(lev < cur->bc_nlevels); /* * Now walk back down the tree, fixing up the cursor's buffer * pointers and key numbers. */ for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { union xfs_btree_ptr *ptrp; ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block); --lev; error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); if (error) goto error0; xfs_btree_setbuf(cur, lev, bp); cur->bc_levels[lev].ptr = 1; } out1: *stat = 1; return 0; out0: *stat = 0; return 0; error0: return error; } /* * Decrement cursor by one record at the level. * For nonzero levels the leaf-ward information is untouched. */ int /* error */ xfs_btree_decrement( struct xfs_btree_cur *cur, int level, int *stat) /* success/failure */ { struct xfs_btree_block *block; struct xfs_buf *bp; int error; /* error return value */ int lev; union xfs_btree_ptr ptr; ASSERT(level < cur->bc_nlevels); /* Read-ahead to the left at this level. */ xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); /* We're done if we remain in the block after the decrement. */ if (--cur->bc_levels[level].ptr > 0) goto out1; /* Get a pointer to the btree block. */ block = xfs_btree_get_block(cur, level, &bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) goto error0; #endif /* Fail if we just went off the left edge of the tree. */ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); if (xfs_btree_ptr_is_null(cur, &ptr)) goto out0; XFS_BTREE_STATS_INC(cur, decrement); /* * March up the tree decrementing pointers. * Stop when we don't go off the left edge of a block. */ for (lev = level + 1; lev < cur->bc_nlevels; lev++) { if (--cur->bc_levels[lev].ptr > 0) break; /* Read-ahead the left block for the next loop. */ xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); } /* * If we went off the root then we are seriously confused. * or the root of the tree is in an inode. */ if (lev == cur->bc_nlevels) { if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) goto out0; ASSERT(0); xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } ASSERT(lev < cur->bc_nlevels); /* * Now walk back down the tree, fixing up the cursor's buffer * pointers and key numbers. */ for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { union xfs_btree_ptr *ptrp; ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block); --lev; error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); if (error) goto error0; xfs_btree_setbuf(cur, lev, bp); cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block); } out1: *stat = 1; return 0; out0: *stat = 0; return 0; error0: return error; } /* * Check the btree block owner now that we have the context to know who the * real owner is. */ static inline xfs_failaddr_t xfs_btree_check_block_owner( struct xfs_btree_cur *cur, struct xfs_btree_block *block) { __u64 owner; if (!xfs_has_crc(cur->bc_mp) || (cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER)) return NULL; owner = xfs_btree_owner(cur); if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (be64_to_cpu(block->bb_u.l.bb_owner) != owner) return __this_address; } else { if (be32_to_cpu(block->bb_u.s.bb_owner) != owner) return __this_address; } return NULL; } int xfs_btree_lookup_get_block( struct xfs_btree_cur *cur, /* btree cursor */ int level, /* level in the btree */ const union xfs_btree_ptr *pp, /* ptr to btree block */ struct xfs_btree_block **blkp) /* return btree block */ { struct xfs_buf *bp; /* buffer pointer for btree block */ xfs_daddr_t daddr; int error = 0; /* special case the root block if in an inode */ if (xfs_btree_at_iroot(cur, level)) { *blkp = xfs_btree_get_iroot(cur); return 0; } /* * If the old buffer at this level for the disk address we are * looking for re-use it. * * Otherwise throw it away and get a new one. */ bp = cur->bc_levels[level].bp; error = xfs_btree_ptr_to_daddr(cur, pp, &daddr); if (error) return error; if (bp && xfs_buf_daddr(bp) == daddr) { *blkp = XFS_BUF_TO_BLOCK(bp); return 0; } error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp); if (error) return error; /* Check the inode owner since the verifiers don't. */ if (xfs_btree_check_block_owner(cur, *blkp) != NULL) goto out_bad; /* Did we get the level we were looking for? */ if (be16_to_cpu((*blkp)->bb_level) != level) goto out_bad; /* Check that internal nodes have at least one record. */ if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0) goto out_bad; xfs_btree_setbuf(cur, level, bp); return 0; out_bad: *blkp = NULL; xfs_buf_mark_corrupt(bp); xfs_trans_brelse(cur->bc_tp, bp); xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } /* * Get current search key. For level 0 we don't actually have a key * structure so we make one up from the record. For all other levels * we just return the right key. */ STATIC union xfs_btree_key * xfs_lookup_get_search_key( struct xfs_btree_cur *cur, int level, int keyno, struct xfs_btree_block *block, union xfs_btree_key *kp) { if (level == 0) { cur->bc_ops->init_key_from_rec(kp, xfs_btree_rec_addr(cur, keyno, block)); return kp; } return xfs_btree_key_addr(cur, keyno, block); } /* * Initialize a pointer to the root block. */ void xfs_btree_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { /* * Inode-rooted btrees call xfs_btree_get_iroot to find the root * in xfs_btree_lookup_get_block and don't need a pointer here. */ ptr->l = 0; } else if (cur->bc_flags & XFS_BTREE_STAGING) { ptr->s = cpu_to_be32(cur->bc_ag.afake->af_root); } else { cur->bc_ops->init_ptr_from_cur(cur, ptr); } } /* * Lookup the record. The cursor is made to point to it, based on dir. * stat is set to 0 if can't find any such record, 1 for success. */ int /* error */ xfs_btree_lookup( struct xfs_btree_cur *cur, /* btree cursor */ xfs_lookup_t dir, /* <=, ==, or >= */ int *stat) /* success/failure */ { struct xfs_btree_block *block; /* current btree block */ int64_t diff; /* difference for the current key */ int error; /* error return value */ int keyno; /* current key number */ int level; /* level in the btree */ union xfs_btree_ptr *pp; /* ptr to btree block */ union xfs_btree_ptr ptr; /* ptr to btree block */ XFS_BTREE_STATS_INC(cur, lookup); /* No such thing as a zero-level tree. */ if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } block = NULL; keyno = 0; /* initialise start pointer from cursor */ xfs_btree_init_ptr_from_cur(cur, &ptr); pp = &ptr; /* * Iterate over each level in the btree, starting at the root. * For each level above the leaves, find the key we need, based * on the lookup record, then follow the corresponding block * pointer down to the next level. */ for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { /* Get the block we need to do the lookup on. */ error = xfs_btree_lookup_get_block(cur, level, pp, &block); if (error) goto error0; if (diff == 0) { /* * If we already had a key match at a higher level, we * know we need to use the first entry in this block. */ keyno = 1; } else { /* Otherwise search this block. Do a binary search. */ int high; /* high entry number */ int low; /* low entry number */ /* Set low and high entry numbers, 1-based. */ low = 1; high = xfs_btree_get_numrecs(block); if (!high) { /* Block is empty, must be an empty leaf. */ if (level != 0 || cur->bc_nlevels != 1) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, cur->bc_mp, block, sizeof(*block)); xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE; *stat = 0; return 0; } /* Binary search the block. */ while (low <= high) { union xfs_btree_key key; union xfs_btree_key *kp; XFS_BTREE_STATS_INC(cur, compare); /* keyno is average of low and high. */ keyno = (low + high) >> 1; /* Get current search key */ kp = xfs_lookup_get_search_key(cur, level, keyno, block, &key); /* * Compute difference to get next direction: * - less than, move right * - greater than, move left * - equal, we're done */ diff = cur->bc_ops->key_diff(cur, kp); if (diff < 0) low = keyno + 1; else if (diff > 0) high = keyno - 1; else break; } } /* * If there are more levels, set up for the next level * by getting the block number and filling in the cursor. */ if (level > 0) { /* * If we moved left, need the previous key number, * unless there isn't one. */ if (diff > 0 && --keyno < 1) keyno = 1; pp = xfs_btree_ptr_addr(cur, keyno, block); error = xfs_btree_debug_check_ptr(cur, pp, 0, level); if (error) goto error0; cur->bc_levels[level].ptr = keyno; } } /* Done with the search. See if we need to adjust the results. */ if (dir != XFS_LOOKUP_LE && diff < 0) { keyno++; /* * If ge search and we went off the end of the block, but it's * not the last block, we're in the wrong block. */ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); if (dir == XFS_LOOKUP_GE && keyno > xfs_btree_get_numrecs(block) && !xfs_btree_ptr_is_null(cur, &ptr)) { int i; cur->bc_levels[0].ptr = keyno; error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } *stat = 1; return 0; } } else if (dir == XFS_LOOKUP_LE && diff > 0) keyno--; cur->bc_levels[0].ptr = keyno; /* Return if we succeeded or not. */ if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) *stat = 0; else if (dir != XFS_LOOKUP_EQ || diff == 0) *stat = 1; else *stat = 0; return 0; error0: return error; } /* Find the high key storage area from a regular key. */ union xfs_btree_key * xfs_btree_high_key_from_key( struct xfs_btree_cur *cur, union xfs_btree_key *key) { ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); return (union xfs_btree_key *)((char *)key + (cur->bc_ops->key_len / 2)); } /* Determine the low (and high if overlapped) keys of a leaf block */ STATIC void xfs_btree_get_leaf_keys( struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_key *key) { union xfs_btree_key max_hkey; union xfs_btree_key hkey; union xfs_btree_rec *rec; union xfs_btree_key *high; int n; rec = xfs_btree_rec_addr(cur, 1, block); cur->bc_ops->init_key_from_rec(key, rec); if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { cur->bc_ops->init_high_key_from_rec(&max_hkey, rec); for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { rec = xfs_btree_rec_addr(cur, n, block); cur->bc_ops->init_high_key_from_rec(&hkey, rec); if (xfs_btree_keycmp_gt(cur, &hkey, &max_hkey)) max_hkey = hkey; } high = xfs_btree_high_key_from_key(cur, key); memcpy(high, &max_hkey, cur->bc_ops->key_len / 2); } } /* Determine the low (and high if overlapped) keys of a node block */ STATIC void xfs_btree_get_node_keys( struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_key *key) { union xfs_btree_key *hkey; union xfs_btree_key *max_hkey; union xfs_btree_key *high; int n; if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { memcpy(key, xfs_btree_key_addr(cur, 1, block), cur->bc_ops->key_len / 2); max_hkey = xfs_btree_high_key_addr(cur, 1, block); for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { hkey = xfs_btree_high_key_addr(cur, n, block); if (xfs_btree_keycmp_gt(cur, hkey, max_hkey)) max_hkey = hkey; } high = xfs_btree_high_key_from_key(cur, key); memcpy(high, max_hkey, cur->bc_ops->key_len / 2); } else { memcpy(key, xfs_btree_key_addr(cur, 1, block), cur->bc_ops->key_len); } } /* Derive the keys for any btree block. */ void xfs_btree_get_keys( struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_key *key) { if (be16_to_cpu(block->bb_level) == 0) xfs_btree_get_leaf_keys(cur, block, key); else xfs_btree_get_node_keys(cur, block, key); } /* * Decide if we need to update the parent keys of a btree block. For * a standard btree this is only necessary if we're updating the first * record/key. For an overlapping btree, we must always update the * keys because the highest key can be in any of the records or keys * in the block. */ static inline bool xfs_btree_needs_key_update( struct xfs_btree_cur *cur, int ptr) { return (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) || ptr == 1; } /* * Update the low and high parent keys of the given level, progressing * towards the root. If force_all is false, stop if the keys for a given * level do not need updating. */ STATIC int __xfs_btree_updkeys( struct xfs_btree_cur *cur, int level, struct xfs_btree_block *block, struct xfs_buf *bp0, bool force_all) { union xfs_btree_key key; /* keys from current level */ union xfs_btree_key *lkey; /* keys from the next level up */ union xfs_btree_key *hkey; union xfs_btree_key *nlkey; /* keys from the next level up */ union xfs_btree_key *nhkey; struct xfs_buf *bp; int ptr; ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); /* Exit if there aren't any parent levels to update. */ if (level + 1 >= cur->bc_nlevels) return 0; trace_xfs_btree_updkeys(cur, level, bp0); lkey = &key; hkey = xfs_btree_high_key_from_key(cur, lkey); xfs_btree_get_keys(cur, block, lkey); for (level++; level < cur->bc_nlevels; level++) { #ifdef DEBUG int error; #endif block = xfs_btree_get_block(cur, level, &bp); trace_xfs_btree_updkeys(cur, level, bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) return error; #endif ptr = cur->bc_levels[level].ptr; nlkey = xfs_btree_key_addr(cur, ptr, block); nhkey = xfs_btree_high_key_addr(cur, ptr, block); if (!force_all && xfs_btree_keycmp_eq(cur, nlkey, lkey) && xfs_btree_keycmp_eq(cur, nhkey, hkey)) break; xfs_btree_copy_keys(cur, nlkey, lkey, 1); xfs_btree_log_keys(cur, bp, ptr, ptr); if (level + 1 >= cur->bc_nlevels) break; xfs_btree_get_node_keys(cur, block, lkey); } return 0; } /* Update all the keys from some level in cursor back to the root. */ STATIC int xfs_btree_updkeys_force( struct xfs_btree_cur *cur, int level) { struct xfs_buf *bp; struct xfs_btree_block *block; block = xfs_btree_get_block(cur, level, &bp); return __xfs_btree_updkeys(cur, level, block, bp, true); } /* * Update the parent keys of the given level, progressing towards the root. */ STATIC int xfs_btree_update_keys( struct xfs_btree_cur *cur, int level) { struct xfs_btree_block *block; struct xfs_buf *bp; union xfs_btree_key *kp; union xfs_btree_key key; int ptr; ASSERT(level >= 0); block = xfs_btree_get_block(cur, level, &bp); if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) return __xfs_btree_updkeys(cur, level, block, bp, false); /* * Go up the tree from this level toward the root. * At each level, update the key value to the value input. * Stop when we reach a level where the cursor isn't pointing * at the first entry in the block. */ xfs_btree_get_keys(cur, block, &key); for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { #ifdef DEBUG int error; #endif block = xfs_btree_get_block(cur, level, &bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) return error; #endif ptr = cur->bc_levels[level].ptr; kp = xfs_btree_key_addr(cur, ptr, block); xfs_btree_copy_keys(cur, kp, &key, 1); xfs_btree_log_keys(cur, bp, ptr, ptr); } return 0; } /* * Update the record referred to by cur to the value in the * given record. This either works (return 0) or gets an * EFSCORRUPTED error. */ int xfs_btree_update( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) { struct xfs_btree_block *block; struct xfs_buf *bp; int error; int ptr; union xfs_btree_rec *rp; /* Pick up the current block. */ block = xfs_btree_get_block(cur, 0, &bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, 0, bp); if (error) goto error0; #endif /* Get the address of the rec to be updated. */ ptr = cur->bc_levels[0].ptr; rp = xfs_btree_rec_addr(cur, ptr, block); /* Fill in the new contents and log them. */ xfs_btree_copy_recs(cur, rp, rec, 1); xfs_btree_log_recs(cur, bp, ptr, ptr); /* Pass new key value up to our parent. */ if (xfs_btree_needs_key_update(cur, ptr)) { error = xfs_btree_update_keys(cur, 0); if (error) goto error0; } return 0; error0: return error; } /* * Move 1 record left from cur/level if possible. * Update cur to reflect the new path. */ STATIC int /* error */ xfs_btree_lshift( struct xfs_btree_cur *cur, int level, int *stat) /* success/failure */ { struct xfs_buf *lbp; /* left buffer pointer */ struct xfs_btree_block *left; /* left btree block */ int lrecs; /* left record count */ struct xfs_buf *rbp; /* right buffer pointer */ struct xfs_btree_block *right; /* right btree block */ struct xfs_btree_cur *tcur; /* temporary btree cursor */ int rrecs; /* right record count */ union xfs_btree_ptr lptr; /* left btree pointer */ union xfs_btree_key *rkp = NULL; /* right btree key */ union xfs_btree_ptr *rpp = NULL; /* right address pointer */ union xfs_btree_rec *rrp = NULL; /* right record pointer */ int error; /* error return value */ int i; if (xfs_btree_at_iroot(cur, level)) goto out0; /* Set up variables for this block as "right". */ right = xfs_btree_get_block(cur, level, &rbp); #ifdef DEBUG error = xfs_btree_check_block(cur, right, level, rbp); if (error) goto error0; #endif /* If we've got no left sibling then we can't shift an entry left. */ xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); if (xfs_btree_ptr_is_null(cur, &lptr)) goto out0; /* * If the cursor entry is the one that would be moved, don't * do it... it's too complicated. */ if (cur->bc_levels[level].ptr <= 1) goto out0; /* Set up the left neighbor as "left". */ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); if (error) goto error0; /* If it's full, it can't take another entry. */ lrecs = xfs_btree_get_numrecs(left); if (lrecs == cur->bc_ops->get_maxrecs(cur, level)) goto out0; rrecs = xfs_btree_get_numrecs(right); /* * We add one entry to the left side and remove one for the right side. * Account for it here, the changes will be updated on disk and logged * later. */ lrecs++; rrecs--; XFS_BTREE_STATS_INC(cur, lshift); XFS_BTREE_STATS_ADD(cur, moves, 1); /* * If non-leaf, copy a key and a ptr to the left block. * Log the changes to the left block. */ if (level > 0) { /* It's a non-leaf. Move keys and pointers. */ union xfs_btree_key *lkp; /* left btree key */ union xfs_btree_ptr *lpp; /* left address pointer */ lkp = xfs_btree_key_addr(cur, lrecs, left); rkp = xfs_btree_key_addr(cur, 1, right); lpp = xfs_btree_ptr_addr(cur, lrecs, left); rpp = xfs_btree_ptr_addr(cur, 1, right); error = xfs_btree_debug_check_ptr(cur, rpp, 0, level); if (error) goto error0; xfs_btree_copy_keys(cur, lkp, rkp, 1); xfs_btree_copy_ptrs(cur, lpp, rpp, 1); xfs_btree_log_keys(cur, lbp, lrecs, lrecs); xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs); ASSERT(cur->bc_ops->keys_inorder(cur, xfs_btree_key_addr(cur, lrecs - 1, left), lkp)); } else { /* It's a leaf. Move records. */ union xfs_btree_rec *lrp; /* left record pointer */ lrp = xfs_btree_rec_addr(cur, lrecs, left); rrp = xfs_btree_rec_addr(cur, 1, right); xfs_btree_copy_recs(cur, lrp, rrp, 1); xfs_btree_log_recs(cur, lbp, lrecs, lrecs); ASSERT(cur->bc_ops->recs_inorder(cur, xfs_btree_rec_addr(cur, lrecs - 1, left), lrp)); } xfs_btree_set_numrecs(left, lrecs); xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); xfs_btree_set_numrecs(right, rrecs); xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); /* * Slide the contents of right down one entry. */ XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); if (level > 0) { /* It's a nonleaf. operate on keys and ptrs */ for (i = 0; i < rrecs; i++) { error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level); if (error) goto error0; } xfs_btree_shift_keys(cur, xfs_btree_key_addr(cur, 2, right), -1, rrecs); xfs_btree_shift_ptrs(cur, xfs_btree_ptr_addr(cur, 2, right), -1, rrecs); xfs_btree_log_keys(cur, rbp, 1, rrecs); xfs_btree_log_ptrs(cur, rbp, 1, rrecs); } else { /* It's a leaf. operate on records */ xfs_btree_shift_recs(cur, xfs_btree_rec_addr(cur, 2, right), -1, rrecs); xfs_btree_log_recs(cur, rbp, 1, rrecs); } /* * Using a temporary cursor, update the parent key values of the * block on the left. */ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { error = xfs_btree_dup_cursor(cur, &tcur); if (error) goto error0; i = xfs_btree_firstrec(tcur, level); if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } error = xfs_btree_decrement(tcur, level, &i); if (error) goto error1; /* Update the parent high keys of the left block, if needed. */ error = xfs_btree_update_keys(tcur, level); if (error) goto error1; xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); } /* Update the parent keys of the right block. */ error = xfs_btree_update_keys(cur, level); if (error) goto error0; /* Slide the cursor value left one. */ cur->bc_levels[level].ptr--; *stat = 1; return 0; out0: *stat = 0; return 0; error0: return error; error1: xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); return error; } /* * Move 1 record right from cur/level if possible. * Update cur to reflect the new path. */ STATIC int /* error */ xfs_btree_rshift( struct xfs_btree_cur *cur, int level, int *stat) /* success/failure */ { struct xfs_buf *lbp; /* left buffer pointer */ struct xfs_btree_block *left; /* left btree block */ struct xfs_buf *rbp; /* right buffer pointer */ struct xfs_btree_block *right; /* right btree block */ struct xfs_btree_cur *tcur; /* temporary btree cursor */ union xfs_btree_ptr rptr; /* right block pointer */ union xfs_btree_key *rkp; /* right btree key */ int rrecs; /* right record count */ int lrecs; /* left record count */ int error; /* error return value */ int i; /* loop counter */ if (xfs_btree_at_iroot(cur, level)) goto out0; /* Set up variables for this block as "left". */ left = xfs_btree_get_block(cur, level, &lbp); #ifdef DEBUG error = xfs_btree_check_block(cur, left, level, lbp); if (error) goto error0; #endif /* If we've got no right sibling then we can't shift an entry right. */ xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); if (xfs_btree_ptr_is_null(cur, &rptr)) goto out0; /* * If the cursor entry is the one that would be moved, don't * do it... it's too complicated. */ lrecs = xfs_btree_get_numrecs(left); if (cur->bc_levels[level].ptr >= lrecs) goto out0; /* Set up the right neighbor as "right". */ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); if (error) goto error0; /* If it's full, it can't take another entry. */ rrecs = xfs_btree_get_numrecs(right); if (rrecs == cur->bc_ops->get_maxrecs(cur, level)) goto out0; XFS_BTREE_STATS_INC(cur, rshift); XFS_BTREE_STATS_ADD(cur, moves, rrecs); /* * Make a hole at the start of the right neighbor block, then * copy the last left block entry to the hole. */ if (level > 0) { /* It's a nonleaf. make a hole in the keys and ptrs */ union xfs_btree_key *lkp; union xfs_btree_ptr *lpp; union xfs_btree_ptr *rpp; lkp = xfs_btree_key_addr(cur, lrecs, left); lpp = xfs_btree_ptr_addr(cur, lrecs, left); rkp = xfs_btree_key_addr(cur, 1, right); rpp = xfs_btree_ptr_addr(cur, 1, right); for (i = rrecs - 1; i >= 0; i--) { error = xfs_btree_debug_check_ptr(cur, rpp, i, level); if (error) goto error0; } xfs_btree_shift_keys(cur, rkp, 1, rrecs); xfs_btree_shift_ptrs(cur, rpp, 1, rrecs); error = xfs_btree_debug_check_ptr(cur, lpp, 0, level); if (error) goto error0; /* Now put the new data in, and log it. */ xfs_btree_copy_keys(cur, rkp, lkp, 1); xfs_btree_copy_ptrs(cur, rpp, lpp, 1); xfs_btree_log_keys(cur, rbp, 1, rrecs + 1); xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1); ASSERT(cur->bc_ops->keys_inorder(cur, rkp, xfs_btree_key_addr(cur, 2, right))); } else { /* It's a leaf. make a hole in the records */ union xfs_btree_rec *lrp; union xfs_btree_rec *rrp; lrp = xfs_btree_rec_addr(cur, lrecs, left); rrp = xfs_btree_rec_addr(cur, 1, right); xfs_btree_shift_recs(cur, rrp, 1, rrecs); /* Now put the new data in, and log it. */ xfs_btree_copy_recs(cur, rrp, lrp, 1); xfs_btree_log_recs(cur, rbp, 1, rrecs + 1); } /* * Decrement and log left's numrecs, bump and log right's numrecs. */ xfs_btree_set_numrecs(left, --lrecs); xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); xfs_btree_set_numrecs(right, ++rrecs); xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); /* * Using a temporary cursor, update the parent key values of the * block on the right. */ error = xfs_btree_dup_cursor(cur, &tcur); if (error) goto error0; i = xfs_btree_lastrec(tcur, level); if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } error = xfs_btree_increment(tcur, level, &i); if (error) goto error1; /* Update the parent high keys of the left block, if needed. */ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { error = xfs_btree_update_keys(cur, level); if (error) goto error1; } /* Update the parent keys of the right block. */ error = xfs_btree_update_keys(tcur, level); if (error) goto error1; xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); *stat = 1; return 0; out0: *stat = 0; return 0; error0: return error; error1: xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); return error; } static inline int xfs_btree_alloc_block( struct xfs_btree_cur *cur, const union xfs_btree_ptr *hint_block, union xfs_btree_ptr *new_block, int *stat) { int error; /* * Don't allow block allocation for a staging cursor, because staging * cursors do not support regular btree modifications. * * Bulk loading uses a separate callback to obtain new blocks from a * preallocated list, which prevents ENOSPC failures during loading. */ if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { ASSERT(0); return -EFSCORRUPTED; } error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat); trace_xfs_btree_alloc_block(cur, new_block, *stat, error); return error; } /* * Split cur/level block in half. * Return new block number and the key to its first * record (to be inserted into parent). */ STATIC int /* error */ __xfs_btree_split( struct xfs_btree_cur *cur, int level, union xfs_btree_ptr *ptrp, union xfs_btree_key *key, struct xfs_btree_cur **curp, int *stat) /* success/failure */ { union xfs_btree_ptr lptr; /* left sibling block ptr */ struct xfs_buf *lbp; /* left buffer pointer */ struct xfs_btree_block *left; /* left btree block */ union xfs_btree_ptr rptr; /* right sibling block ptr */ struct xfs_buf *rbp; /* right buffer pointer */ struct xfs_btree_block *right; /* right btree block */ union xfs_btree_ptr rrptr; /* right-right sibling ptr */ struct xfs_buf *rrbp; /* right-right buffer pointer */ struct xfs_btree_block *rrblock; /* right-right btree block */ int lrecs; int rrecs; int src_index; int error; /* error return value */ int i; XFS_BTREE_STATS_INC(cur, split); /* Set up left block (current one). */ left = xfs_btree_get_block(cur, level, &lbp); #ifdef DEBUG error = xfs_btree_check_block(cur, left, level, lbp); if (error) goto error0; #endif xfs_btree_buf_to_ptr(cur, lbp, &lptr); /* Allocate the new block. If we can't do it, we're toast. Give up. */ error = xfs_btree_alloc_block(cur, &lptr, &rptr, stat); if (error) goto error0; if (*stat == 0) goto out0; XFS_BTREE_STATS_INC(cur, alloc); /* Set up the new block as "right". */ error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp); if (error) goto error0; /* Fill in the btree header for the new right block. */ xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0); /* * Split the entries between the old and the new block evenly. * Make sure that if there's an odd number of entries now, that * each new block will have the same number of entries. */ lrecs = xfs_btree_get_numrecs(left); rrecs = lrecs / 2; if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1) rrecs++; src_index = (lrecs - rrecs + 1); XFS_BTREE_STATS_ADD(cur, moves, rrecs); /* Adjust numrecs for the later get_*_keys() calls. */ lrecs -= rrecs; xfs_btree_set_numrecs(left, lrecs); xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); /* * Copy btree block entries from the left block over to the * new block, the right. Update the right block and log the * changes. */ if (level > 0) { /* It's a non-leaf. Move keys and pointers. */ union xfs_btree_key *lkp; /* left btree key */ union xfs_btree_ptr *lpp; /* left address pointer */ union xfs_btree_key *rkp; /* right btree key */ union xfs_btree_ptr *rpp; /* right address pointer */ lkp = xfs_btree_key_addr(cur, src_index, left); lpp = xfs_btree_ptr_addr(cur, src_index, left); rkp = xfs_btree_key_addr(cur, 1, right); rpp = xfs_btree_ptr_addr(cur, 1, right); for (i = src_index; i < rrecs; i++) { error = xfs_btree_debug_check_ptr(cur, lpp, i, level); if (error) goto error0; } /* Copy the keys & pointers to the new block. */ xfs_btree_copy_keys(cur, rkp, lkp, rrecs); xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs); xfs_btree_log_keys(cur, rbp, 1, rrecs); xfs_btree_log_ptrs(cur, rbp, 1, rrecs); /* Stash the keys of the new block for later insertion. */ xfs_btree_get_node_keys(cur, right, key); } else { /* It's a leaf. Move records. */ union xfs_btree_rec *lrp; /* left record pointer */ union xfs_btree_rec *rrp; /* right record pointer */ lrp = xfs_btree_rec_addr(cur, src_index, left); rrp = xfs_btree_rec_addr(cur, 1, right); /* Copy records to the new block. */ xfs_btree_copy_recs(cur, rrp, lrp, rrecs); xfs_btree_log_recs(cur, rbp, 1, rrecs); /* Stash the keys of the new block for later insertion. */ xfs_btree_get_leaf_keys(cur, right, key); } /* * Find the left block number by looking in the buffer. * Adjust sibling pointers. */ xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB); xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB); xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS); xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); /* * If there's a block to the new block's right, make that block * point back to right instead of to left. */ if (!xfs_btree_ptr_is_null(cur, &rrptr)) { error = xfs_btree_read_buf_block(cur, &rrptr, 0, &rrblock, &rrbp); if (error) goto error0; xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB); xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); } /* Update the parent high keys of the left block, if needed. */ if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { error = xfs_btree_update_keys(cur, level); if (error) goto error0; } /* * If the cursor is really in the right block, move it there. * If it's just pointing past the last entry in left, then we'll * insert there, so don't change anything in that case. */ if (cur->bc_levels[level].ptr > lrecs + 1) { xfs_btree_setbuf(cur, level, rbp); cur->bc_levels[level].ptr -= lrecs; } /* * If there are more levels, we'll need another cursor which refers * the right block, no matter where this cursor was. */ if (level + 1 < cur->bc_nlevels) { error = xfs_btree_dup_cursor(cur, curp); if (error) goto error0; (*curp)->bc_levels[level + 1].ptr++; } *ptrp = rptr; *stat = 1; return 0; out0: *stat = 0; return 0; error0: return error; } #ifdef __KERNEL__ struct xfs_btree_split_args { struct xfs_btree_cur *cur; int level; union xfs_btree_ptr *ptrp; union xfs_btree_key *key; struct xfs_btree_cur **curp; int *stat; /* success/failure */ int result; bool kswapd; /* allocation in kswapd context */ struct completion *done; struct work_struct work; }; /* * Stack switching interfaces for allocation */ static void xfs_btree_split_worker( struct work_struct *work) { struct xfs_btree_split_args *args = container_of(work, struct xfs_btree_split_args, work); unsigned long pflags; unsigned long new_pflags = 0; /* * we are in a transaction context here, but may also be doing work * in kswapd context, and hence we may need to inherit that state * temporarily to ensure that we don't block waiting for memory reclaim * in any way. */ if (args->kswapd) new_pflags |= PF_MEMALLOC | PF_KSWAPD; current_set_flags_nested(&pflags, new_pflags); xfs_trans_set_context(args->cur->bc_tp); args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, args->key, args->curp, args->stat); xfs_trans_clear_context(args->cur->bc_tp); current_restore_flags_nested(&pflags, new_pflags); /* * Do not access args after complete() has run here. We don't own args * and the owner may run and free args before we return here. */ complete(args->done); } /* * BMBT split requests often come in with little stack to work on so we push * them off to a worker thread so there is lots of stack to use. For the other * btree types, just call directly to avoid the context switch overhead here. * * Care must be taken here - the work queue rescuer thread introduces potential * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new * AGFs to allocate blocks. A task being run by the rescuer could attempt to * lock an AGF that is already locked by a task queued to run by the rescuer, * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to * release it until the current thread it is running gains the lock. * * To avoid this issue, we only ever queue BMBT splits that don't have an AGF * already locked to allocate from. The only place that doesn't hold an AGF * locked is unwritten extent conversion at IO completion, but that has already * been offloaded to a worker thread and hence has no stack consumption issues * we have to worry about. */ STATIC int /* error */ xfs_btree_split( struct xfs_btree_cur *cur, int level, union xfs_btree_ptr *ptrp, union xfs_btree_key *key, struct xfs_btree_cur **curp, int *stat) /* success/failure */ { struct xfs_btree_split_args args; DECLARE_COMPLETION_ONSTACK(done); if (!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_tp->t_highest_agno == NULLAGNUMBER) return __xfs_btree_split(cur, level, ptrp, key, curp, stat); args.cur = cur; args.level = level; args.ptrp = ptrp; args.key = key; args.curp = curp; args.stat = stat; args.done = &done; args.kswapd = current_is_kswapd(); INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker); queue_work(xfs_alloc_wq, &args.work); wait_for_completion(&done); destroy_work_on_stack(&args.work); return args.result; } #else #define xfs_btree_split __xfs_btree_split #endif /* __KERNEL__ */ /* * Copy the old inode root contents into a real block and make the * broot point to it. */ int /* error */ xfs_btree_new_iroot( struct xfs_btree_cur *cur, /* btree cursor */ int *logflags, /* logging flags for inode */ int *stat) /* return status - 0 fail */ { struct xfs_buf *cbp; /* buffer for cblock */ struct xfs_btree_block *block; /* btree block */ struct xfs_btree_block *cblock; /* child btree block */ union xfs_btree_key *ckp; /* child key pointer */ union xfs_btree_ptr *cpp; /* child ptr pointer */ union xfs_btree_key *kp; /* pointer to btree key */ union xfs_btree_ptr *pp; /* pointer to block addr */ union xfs_btree_ptr nptr; /* new block addr */ int level; /* btree level */ int error; /* error return code */ int i; /* loop counter */ XFS_BTREE_STATS_INC(cur, newroot); ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); level = cur->bc_nlevels - 1; block = xfs_btree_get_iroot(cur); pp = xfs_btree_ptr_addr(cur, 1, block); /* Allocate the new block. If we can't do it, we're toast. Give up. */ error = xfs_btree_alloc_block(cur, pp, &nptr, stat); if (error) goto error0; if (*stat == 0) return 0; XFS_BTREE_STATS_INC(cur, alloc); /* Copy the root into a real block. */ error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp); if (error) goto error0; /* * we can't just memcpy() the root in for CRC enabled btree blocks. * In that case have to also ensure the blkno remains correct */ memcpy(cblock, block, xfs_btree_block_len(cur)); if (xfs_has_crc(cur->bc_mp)) { __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp)); if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) cblock->bb_u.l.bb_blkno = bno; else cblock->bb_u.s.bb_blkno = bno; } be16_add_cpu(&block->bb_level, 1); xfs_btree_set_numrecs(block, 1); cur->bc_nlevels++; ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); cur->bc_levels[level + 1].ptr = 1; kp = xfs_btree_key_addr(cur, 1, block); ckp = xfs_btree_key_addr(cur, 1, cblock); xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock)); cpp = xfs_btree_ptr_addr(cur, 1, cblock); for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { error = xfs_btree_debug_check_ptr(cur, pp, i, level); if (error) goto error0; } xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock)); error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level); if (error) goto error0; xfs_btree_copy_ptrs(cur, pp, &nptr, 1); xfs_iroot_realloc(cur->bc_ino.ip, 1 - xfs_btree_get_numrecs(cblock), cur->bc_ino.whichfork); xfs_btree_setbuf(cur, level, cbp); /* * Do all this logging at the end so that * the root is at the right level. */ xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); *logflags |= XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); *stat = 1; return 0; error0: return error; } static void xfs_btree_set_root( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, int inc) { if (cur->bc_flags & XFS_BTREE_STAGING) { /* Update the btree root information for a per-AG fake root. */ cur->bc_ag.afake->af_root = be32_to_cpu(ptr->s); cur->bc_ag.afake->af_levels += inc; } else { cur->bc_ops->set_root(cur, ptr, inc); } } /* * Allocate a new root block, fill it in. */ STATIC int /* error */ xfs_btree_new_root( struct xfs_btree_cur *cur, /* btree cursor */ int *stat) /* success/failure */ { struct xfs_btree_block *block; /* one half of the old root block */ struct xfs_buf *bp; /* buffer containing block */ int error; /* error return value */ struct xfs_buf *lbp; /* left buffer pointer */ struct xfs_btree_block *left; /* left btree block */ struct xfs_buf *nbp; /* new (root) buffer */ struct xfs_btree_block *new; /* new (root) btree block */ int nptr; /* new value for key index, 1 or 2 */ struct xfs_buf *rbp; /* right buffer pointer */ struct xfs_btree_block *right; /* right btree block */ union xfs_btree_ptr rptr; union xfs_btree_ptr lptr; XFS_BTREE_STATS_INC(cur, newroot); /* initialise our start point from the cursor */ xfs_btree_init_ptr_from_cur(cur, &rptr); /* Allocate the new block. If we can't do it, we're toast. Give up. */ error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat); if (error) goto error0; if (*stat == 0) goto out0; XFS_BTREE_STATS_INC(cur, alloc); /* Set up the new block. */ error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp); if (error) goto error0; /* Set the root in the holding structure increasing the level by 1. */ xfs_btree_set_root(cur, &lptr, 1); /* * At the previous root level there are now two blocks: the old root, * and the new block generated when it was split. We don't know which * one the cursor is pointing at, so we set up variables "left" and * "right" for each case. */ block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp); if (error) goto error0; #endif xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); if (!xfs_btree_ptr_is_null(cur, &rptr)) { /* Our block is left, pick up the right block. */ lbp = bp; xfs_btree_buf_to_ptr(cur, lbp, &lptr); left = block; error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); if (error) goto error0; bp = rbp; nptr = 1; } else { /* Our block is right, pick up the left block. */ rbp = bp; xfs_btree_buf_to_ptr(cur, rbp, &rptr); right = block; xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); if (error) goto error0; bp = lbp; nptr = 2; } /* Fill in the new block's btree header and log it. */ xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && !xfs_btree_ptr_is_null(cur, &rptr)); /* Fill in the key data in the new root. */ if (xfs_btree_get_level(left) > 0) { /* * Get the keys for the left block's keys and put them directly * in the parent block. Do the same for the right block. */ xfs_btree_get_node_keys(cur, left, xfs_btree_key_addr(cur, 1, new)); xfs_btree_get_node_keys(cur, right, xfs_btree_key_addr(cur, 2, new)); } else { /* * Get the keys for the left block's records and put them * directly in the parent block. Do the same for the right * block. */ xfs_btree_get_leaf_keys(cur, left, xfs_btree_key_addr(cur, 1, new)); xfs_btree_get_leaf_keys(cur, right, xfs_btree_key_addr(cur, 2, new)); } xfs_btree_log_keys(cur, nbp, 1, 2); /* Fill in the pointer data in the new root. */ xfs_btree_copy_ptrs(cur, xfs_btree_ptr_addr(cur, 1, new), &lptr, 1); xfs_btree_copy_ptrs(cur, xfs_btree_ptr_addr(cur, 2, new), &rptr, 1); xfs_btree_log_ptrs(cur, nbp, 1, 2); /* Fix up the cursor. */ xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); cur->bc_levels[cur->bc_nlevels].ptr = nptr; cur->bc_nlevels++; ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); *stat = 1; return 0; error0: return error; out0: *stat = 0; return 0; } STATIC int xfs_btree_make_block_unfull( struct xfs_btree_cur *cur, /* btree cursor */ int level, /* btree level */ int numrecs,/* # of recs in block */ int *oindex,/* old tree index */ int *index, /* new tree index */ union xfs_btree_ptr *nptr, /* new btree ptr */ struct xfs_btree_cur **ncur, /* new btree cursor */ union xfs_btree_key *key, /* key of new block */ int *stat) { int error = 0; if (xfs_btree_at_iroot(cur, level)) { struct xfs_inode *ip = cur->bc_ino.ip; if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { /* A root block that can be made bigger. */ xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork); *stat = 1; } else { /* A root block that needs replacing */ int logflags = 0; error = xfs_btree_new_iroot(cur, &logflags, stat); if (error || *stat == 0) return error; xfs_trans_log_inode(cur->bc_tp, ip, logflags); } return 0; } /* First, try shifting an entry to the right neighbor. */ error = xfs_btree_rshift(cur, level, stat); if (error || *stat) return error; /* Next, try shifting an entry to the left neighbor. */ error = xfs_btree_lshift(cur, level, stat); if (error) return error; if (*stat) { *oindex = *index = cur->bc_levels[level].ptr; return 0; } /* * Next, try splitting the current block in half. * * If this works we have to re-set our variables because we * could be in a different block now. */ error = xfs_btree_split(cur, level, nptr, key, ncur, stat); if (error || *stat == 0) return error; *index = cur->bc_levels[level].ptr; return 0; } /* * Insert one record/level. Return information to the caller * allowing the next level up to proceed if necessary. */ STATIC int xfs_btree_insrec( struct xfs_btree_cur *cur, /* btree cursor */ int level, /* level to insert record at */ union xfs_btree_ptr *ptrp, /* i/o: block number inserted */ union xfs_btree_rec *rec, /* record to insert */ union xfs_btree_key *key, /* i/o: block key for ptrp */ struct xfs_btree_cur **curp, /* output: new cursor replacing cur */ int *stat) /* success/failure */ { struct xfs_btree_block *block; /* btree block */ struct xfs_buf *bp; /* buffer for block */ union xfs_btree_ptr nptr; /* new block ptr */ struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ union xfs_btree_key nkey; /* new block key */ union xfs_btree_key *lkey; int optr; /* old key/record index */ int ptr; /* key/record index */ int numrecs;/* number of records */ int error; /* error return value */ int i; xfs_daddr_t old_bn; ncur = NULL; lkey = &nkey; /* * If we have an external root pointer, and we've made it to the * root level, allocate a new root block and we're done. */ if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE && level >= cur->bc_nlevels) { error = xfs_btree_new_root(cur, stat); xfs_btree_set_ptr_null(cur, ptrp); return error; } /* If we're off the left edge, return failure. */ ptr = cur->bc_levels[level].ptr; if (ptr == 0) { *stat = 0; return 0; } optr = ptr; XFS_BTREE_STATS_INC(cur, insrec); /* Get pointers to the btree buffer and block. */ block = xfs_btree_get_block(cur, level, &bp); old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL; numrecs = xfs_btree_get_numrecs(block); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) goto error0; /* Check that the new entry is being inserted in the right place. */ if (ptr <= numrecs) { if (level == 0) { ASSERT(cur->bc_ops->recs_inorder(cur, rec, xfs_btree_rec_addr(cur, ptr, block))); } else { ASSERT(cur->bc_ops->keys_inorder(cur, key, xfs_btree_key_addr(cur, ptr, block))); } } #endif /* * If the block is full, we can't insert the new entry until we * make the block un-full. */ xfs_btree_set_ptr_null(cur, &nptr); if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) { error = xfs_btree_make_block_unfull(cur, level, numrecs, &optr, &ptr, &nptr, &ncur, lkey, stat); if (error || *stat == 0) goto error0; } /* * The current block may have changed if the block was * previously full and we have just made space in it. */ block = xfs_btree_get_block(cur, level, &bp); numrecs = xfs_btree_get_numrecs(block); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) goto error0; #endif /* * At this point we know there's room for our new entry in the block * we're pointing at. */ XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1); if (level > 0) { /* It's a nonleaf. make a hole in the keys and ptrs */ union xfs_btree_key *kp; union xfs_btree_ptr *pp; kp = xfs_btree_key_addr(cur, ptr, block); pp = xfs_btree_ptr_addr(cur, ptr, block); for (i = numrecs - ptr; i >= 0; i--) { error = xfs_btree_debug_check_ptr(cur, pp, i, level); if (error) goto error0; } xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1); error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level); if (error) goto error0; /* Now put the new data in, bump numrecs and log it. */ xfs_btree_copy_keys(cur, kp, key, 1); xfs_btree_copy_ptrs(cur, pp, ptrp, 1); numrecs++; xfs_btree_set_numrecs(block, numrecs); xfs_btree_log_ptrs(cur, bp, ptr, numrecs); xfs_btree_log_keys(cur, bp, ptr, numrecs); #ifdef DEBUG if (ptr < numrecs) { ASSERT(cur->bc_ops->keys_inorder(cur, kp, xfs_btree_key_addr(cur, ptr + 1, block))); } #endif } else { /* It's a leaf. make a hole in the records */ union xfs_btree_rec *rp; rp = xfs_btree_rec_addr(cur, ptr, block); xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1); /* Now put the new data in, bump numrecs and log it. */ xfs_btree_copy_recs(cur, rp, rec, 1); xfs_btree_set_numrecs(block, ++numrecs); xfs_btree_log_recs(cur, bp, ptr, numrecs); #ifdef DEBUG if (ptr < numrecs) { ASSERT(cur->bc_ops->recs_inorder(cur, rp, xfs_btree_rec_addr(cur, ptr + 1, block))); } #endif } /* Log the new number of records in the btree header. */ xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); /* * Update btree keys to reflect the newly added record or keyptr. * There are three cases here to be aware of. Normally, all we have to * do is walk towards the root, updating keys as necessary. * * If the caller had us target a full block for the insertion, we dealt * with that by calling the _make_block_unfull function. If the * "make unfull" function splits the block, it'll hand us back the key * and pointer of the new block. We haven't yet added the new block to * the next level up, so if we decide to add the new record to the new * block (bp->b_bn != old_bn), we have to update the caller's pointer * so that the caller adds the new block with the correct key. * * However, there is a third possibility-- if the selected block is the * root block of an inode-rooted btree and cannot be expanded further, * the "make unfull" function moves the root block contents to a new * block and updates the root block to point to the new block. In this * case, no block pointer is passed back because the block has already * been added to the btree. In this case, we need to use the regular * key update function, just like the first case. This is critical for * overlapping btrees, because the high key must be updated to reflect * the entire tree, not just the subtree accessible through the first * child of the root (which is now two levels down from the root). */ if (!xfs_btree_ptr_is_null(cur, &nptr) && bp && xfs_buf_daddr(bp) != old_bn) { xfs_btree_get_keys(cur, block, lkey); } else if (xfs_btree_needs_key_update(cur, optr)) { error = xfs_btree_update_keys(cur, level); if (error) goto error0; } /* * Return the new block number, if any. * If there is one, give back a record value and a cursor too. */ *ptrp = nptr; if (!xfs_btree_ptr_is_null(cur, &nptr)) { xfs_btree_copy_keys(cur, key, lkey, 1); *curp = ncur; } *stat = 1; return 0; error0: if (ncur) xfs_btree_del_cursor(ncur, error); return error; } /* * Insert the record at the point referenced by cur. * * A multi-level split of the tree on insert will invalidate the original * cursor. All callers of this function should assume that the cursor is * no longer valid and revalidate it. */ int xfs_btree_insert( struct xfs_btree_cur *cur, int *stat) { int error; /* error return value */ int i; /* result value, 0 for failure */ int level; /* current level number in btree */ union xfs_btree_ptr nptr; /* new block number (split result) */ struct xfs_btree_cur *ncur; /* new cursor (split result) */ struct xfs_btree_cur *pcur; /* previous level's cursor */ union xfs_btree_key bkey; /* key of block to insert */ union xfs_btree_key *key; union xfs_btree_rec rec; /* record to insert */ level = 0; ncur = NULL; pcur = cur; key = &bkey; xfs_btree_set_ptr_null(cur, &nptr); /* Make a key out of the record data to be inserted, and save it. */ cur->bc_ops->init_rec_from_cur(cur, &rec); cur->bc_ops->init_key_from_rec(key, &rec); /* * Loop going up the tree, starting at the leaf level. * Stop when we don't get a split block, that must mean that * the insert is finished with this level. */ do { /* * Insert nrec/nptr into this level of the tree. * Note if we fail, nptr will be null. */ error = xfs_btree_insrec(pcur, level, &nptr, &rec, key, &ncur, &i); if (error) { if (pcur != cur) xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); goto error0; } if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } level++; /* * See if the cursor we just used is trash. * Can't trash the caller's cursor, but otherwise we should * if ncur is a new cursor or we're about to be done. */ if (pcur != cur && (ncur || xfs_btree_ptr_is_null(cur, &nptr))) { /* Save the state from the cursor before we trash it */ if (cur->bc_ops->update_cursor && !(cur->bc_flags & XFS_BTREE_STAGING)) cur->bc_ops->update_cursor(pcur, cur); cur->bc_nlevels = pcur->bc_nlevels; xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); } /* If we got a new cursor, switch to it. */ if (ncur) { pcur = ncur; ncur = NULL; } } while (!xfs_btree_ptr_is_null(cur, &nptr)); *stat = i; return 0; error0: return error; } /* * Try to merge a non-leaf block back into the inode root. * * Note: the killroot names comes from the fact that we're effectively * killing the old root block. But because we can't just delete the * inode we have to copy the single block it was pointing to into the * inode. */ STATIC int xfs_btree_kill_iroot( struct xfs_btree_cur *cur) { int whichfork = cur->bc_ino.whichfork; struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_block *block; struct xfs_btree_block *cblock; union xfs_btree_key *kp; union xfs_btree_key *ckp; union xfs_btree_ptr *pp; union xfs_btree_ptr *cpp; struct xfs_buf *cbp; int level; int index; int numrecs; int error; #ifdef DEBUG union xfs_btree_ptr ptr; #endif int i; ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); ASSERT(cur->bc_nlevels > 1); /* * Don't deal with the root block needs to be a leaf case. * We're just going to turn the thing back into extents anyway. */ level = cur->bc_nlevels - 1; if (level == 1) goto out0; /* * Give up if the root has multiple children. */ block = xfs_btree_get_iroot(cur); if (xfs_btree_get_numrecs(block) != 1) goto out0; cblock = xfs_btree_get_block(cur, level - 1, &cbp); numrecs = xfs_btree_get_numrecs(cblock); /* * Only do this if the next level will fit. * Then the data must be copied up to the inode, * instead of freeing the root you free the next level. */ if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level)) goto out0; XFS_BTREE_STATS_INC(cur, killroot); #ifdef DEBUG xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); #endif index = numrecs - cur->bc_ops->get_maxrecs(cur, level); if (index) { xfs_iroot_realloc(cur->bc_ino.ip, index, cur->bc_ino.whichfork); block = ifp->if_broot; } be16_add_cpu(&block->bb_numrecs, index); ASSERT(block->bb_numrecs == cblock->bb_numrecs); kp = xfs_btree_key_addr(cur, 1, block); ckp = xfs_btree_key_addr(cur, 1, cblock); xfs_btree_copy_keys(cur, kp, ckp, numrecs); pp = xfs_btree_ptr_addr(cur, 1, block); cpp = xfs_btree_ptr_addr(cur, 1, cblock); for (i = 0; i < numrecs; i++) { error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1); if (error) return error; } xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); error = xfs_btree_free_block(cur, cbp); if (error) return error; cur->bc_levels[level - 1].bp = NULL; be16_add_cpu(&block->bb_level, -1); xfs_trans_log_inode(cur->bc_tp, ip, XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork)); cur->bc_nlevels--; out0: return 0; } /* * Kill the current root node, and replace it with it's only child node. */ STATIC int xfs_btree_kill_root( struct xfs_btree_cur *cur, struct xfs_buf *bp, int level, union xfs_btree_ptr *newroot) { int error; XFS_BTREE_STATS_INC(cur, killroot); /* * Update the root pointer, decreasing the level by 1 and then * free the old root. */ xfs_btree_set_root(cur, newroot, -1); error = xfs_btree_free_block(cur, bp); if (error) return error; cur->bc_levels[level].bp = NULL; cur->bc_levels[level].ra = 0; cur->bc_nlevels--; return 0; } STATIC int xfs_btree_dec_cursor( struct xfs_btree_cur *cur, int level, int *stat) { int error; int i; if (level > 0) { error = xfs_btree_decrement(cur, level, &i); if (error) return error; } *stat = 1; return 0; } /* * Single level of the btree record deletion routine. * Delete record pointed to by cur/level. * Remove the record from its block then rebalance the tree. * Return 0 for error, 1 for done, 2 to go on to the next level. */ STATIC int /* error */ xfs_btree_delrec( struct xfs_btree_cur *cur, /* btree cursor */ int level, /* level removing record from */ int *stat) /* fail/done/go-on */ { struct xfs_btree_block *block; /* btree block */ union xfs_btree_ptr cptr; /* current block ptr */ struct xfs_buf *bp; /* buffer for block */ int error; /* error return value */ int i; /* loop counter */ union xfs_btree_ptr lptr; /* left sibling block ptr */ struct xfs_buf *lbp; /* left buffer pointer */ struct xfs_btree_block *left; /* left btree block */ int lrecs = 0; /* left record count */ int ptr; /* key/record index */ union xfs_btree_ptr rptr; /* right sibling block ptr */ struct xfs_buf *rbp; /* right buffer pointer */ struct xfs_btree_block *right; /* right btree block */ struct xfs_btree_block *rrblock; /* right-right btree block */ struct xfs_buf *rrbp; /* right-right buffer pointer */ int rrecs = 0; /* right record count */ struct xfs_btree_cur *tcur; /* temporary btree cursor */ int numrecs; /* temporary numrec count */ tcur = NULL; /* Get the index of the entry being deleted, check for nothing there. */ ptr = cur->bc_levels[level].ptr; if (ptr == 0) { *stat = 0; return 0; } /* Get the buffer & block containing the record or key/ptr. */ block = xfs_btree_get_block(cur, level, &bp); numrecs = xfs_btree_get_numrecs(block); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) goto error0; #endif /* Fail if we're off the end of the block. */ if (ptr > numrecs) { *stat = 0; return 0; } XFS_BTREE_STATS_INC(cur, delrec); XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr); /* Excise the entries being deleted. */ if (level > 0) { /* It's a nonleaf. operate on keys and ptrs */ union xfs_btree_key *lkp; union xfs_btree_ptr *lpp; lkp = xfs_btree_key_addr(cur, ptr + 1, block); lpp = xfs_btree_ptr_addr(cur, ptr + 1, block); for (i = 0; i < numrecs - ptr; i++) { error = xfs_btree_debug_check_ptr(cur, lpp, i, level); if (error) goto error0; } if (ptr < numrecs) { xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr); xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr); xfs_btree_log_keys(cur, bp, ptr, numrecs - 1); xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1); } } else { /* It's a leaf. operate on records */ if (ptr < numrecs) { xfs_btree_shift_recs(cur, xfs_btree_rec_addr(cur, ptr + 1, block), -1, numrecs - ptr); xfs_btree_log_recs(cur, bp, ptr, numrecs - 1); } } /* * Decrement and log the number of entries in the block. */ xfs_btree_set_numrecs(block, --numrecs); xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); /* * We're at the root level. First, shrink the root block in-memory. * Try to get rid of the next level down. If we can't then there's * nothing left to do. */ if (xfs_btree_at_iroot(cur, level)) { xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork); error = xfs_btree_kill_iroot(cur); if (error) goto error0; error = xfs_btree_dec_cursor(cur, level, stat); if (error) goto error0; *stat = 1; return 0; } /* * If this is the root level, and there's only one entry left, and it's * NOT the leaf level, then we can get rid of this level. */ if (level == cur->bc_nlevels - 1) { if (numrecs == 1 && level > 0) { union xfs_btree_ptr *pp; /* * pp is still set to the first pointer in the block. * Make it the new root of the btree. */ pp = xfs_btree_ptr_addr(cur, 1, block); error = xfs_btree_kill_root(cur, bp, level, pp); if (error) goto error0; } else if (level > 0) { error = xfs_btree_dec_cursor(cur, level, stat); if (error) goto error0; } *stat = 1; return 0; } /* * If we deleted the leftmost entry in the block, update the * key values above us in the tree. */ if (xfs_btree_needs_key_update(cur, ptr)) { error = xfs_btree_update_keys(cur, level); if (error) goto error0; } /* * If the number of records remaining in the block is at least * the minimum, we're done. */ if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) { error = xfs_btree_dec_cursor(cur, level, stat); if (error) goto error0; return 0; } /* * Otherwise, we have to move some records around to keep the * tree balanced. Look at the left and right sibling blocks to * see if we can re-balance by moving only one record. */ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { /* * One child of root, need to get a chance to copy its contents * into the root and delete it. Can't go up to next level, * there's nothing to delete there. */ if (xfs_btree_ptr_is_null(cur, &rptr) && xfs_btree_ptr_is_null(cur, &lptr) && level == cur->bc_nlevels - 2) { error = xfs_btree_kill_iroot(cur); if (!error) error = xfs_btree_dec_cursor(cur, level, stat); if (error) goto error0; return 0; } } ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) || !xfs_btree_ptr_is_null(cur, &lptr)); /* * Duplicate the cursor so our btree manipulations here won't * disrupt the next level up. */ error = xfs_btree_dup_cursor(cur, &tcur); if (error) goto error0; /* * If there's a right sibling, see if it's ok to shift an entry * out of it. */ if (!xfs_btree_ptr_is_null(cur, &rptr)) { /* * Move the temp cursor to the last entry in the next block. * Actually any entry but the first would suffice. */ i = xfs_btree_lastrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } error = xfs_btree_increment(tcur, level, &i); if (error) goto error0; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } i = xfs_btree_lastrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } /* Grab a pointer to the block. */ right = xfs_btree_get_block(tcur, level, &rbp); #ifdef DEBUG error = xfs_btree_check_block(tcur, right, level, rbp); if (error) goto error0; #endif /* Grab the current block number, for future use. */ xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB); /* * If right block is full enough so that removing one entry * won't make it too empty, and left-shifting an entry out * of right to us works, we're done. */ if (xfs_btree_get_numrecs(right) - 1 >= cur->bc_ops->get_minrecs(tcur, level)) { error = xfs_btree_lshift(tcur, level, &i); if (error) goto error0; if (i) { ASSERT(xfs_btree_get_numrecs(block) >= cur->bc_ops->get_minrecs(tcur, level)); xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); tcur = NULL; error = xfs_btree_dec_cursor(cur, level, stat); |