850 820 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 | // SPDX-License-Identifier: GPL-2.0-only /* * Generic HDLC support routines for Linux * * Copyright (C) 1999 - 2008 Krzysztof Halasa <khc@pm.waw.pl> * * Currently supported: * * raw IP-in-HDLC * * Cisco HDLC * * Frame Relay with ANSI or CCITT LMI (both user and network side) * * PPP * * X.25 * * Use sethdlc utility to set line parameters, protocol and PVCs * * How does it work: * - proto->open(), close(), start(), stop() calls are serialized. * The order is: open, [ start, stop ... ] close ... * - proto->start() and stop() are called with spin_lock_irq held. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/hdlc.h> #include <linux/if_arp.h> #include <linux/inetdevice.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/notifier.h> #include <linux/pkt_sched.h> #include <linux/poll.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/net_namespace.h> static const char *version = "HDLC support module revision 1.22"; #undef DEBUG_LINK static struct hdlc_proto *first_proto; static int hdlc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *p, struct net_device *orig_dev) { struct hdlc_device *hdlc; /* First make sure "dev" is an HDLC device */ if (!(dev->priv_flags & IFF_WAN_HDLC)) { kfree_skb(skb); return NET_RX_SUCCESS; } hdlc = dev_to_hdlc(dev); if (!net_eq(dev_net(dev), &init_net)) { kfree_skb(skb); return 0; } BUG_ON(!hdlc->proto->netif_rx); return hdlc->proto->netif_rx(skb); } netdev_tx_t hdlc_start_xmit(struct sk_buff *skb, struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); if (hdlc->proto->xmit) return hdlc->proto->xmit(skb, dev); return hdlc->xmit(skb, dev); /* call hardware driver directly */ } EXPORT_SYMBOL(hdlc_start_xmit); static inline void hdlc_proto_start(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); if (hdlc->proto->start) hdlc->proto->start(dev); } static inline void hdlc_proto_stop(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); if (hdlc->proto->stop) hdlc->proto->stop(dev); } static int hdlc_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); hdlc_device *hdlc; unsigned long flags; int on; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (!(dev->priv_flags & IFF_WAN_HDLC)) return NOTIFY_DONE; /* not an HDLC device */ if (event != NETDEV_CHANGE) return NOTIFY_DONE; /* Only interested in carrier changes */ on = netif_carrier_ok(dev); #ifdef DEBUG_LINK printk(KERN_DEBUG "%s: hdlc_device_event NETDEV_CHANGE, carrier %i\n", dev->name, on); #endif hdlc = dev_to_hdlc(dev); spin_lock_irqsave(&hdlc->state_lock, flags); if (hdlc->carrier == on) goto carrier_exit; /* no change in DCD line level */ hdlc->carrier = on; if (!hdlc->open) goto carrier_exit; if (hdlc->carrier) { netdev_info(dev, "Carrier detected\n"); hdlc_proto_start(dev); } else { netdev_info(dev, "Carrier lost\n"); hdlc_proto_stop(dev); } carrier_exit: spin_unlock_irqrestore(&hdlc->state_lock, flags); return NOTIFY_DONE; } /* Must be called by hardware driver when HDLC device is being opened */ int hdlc_open(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); #ifdef DEBUG_LINK printk(KERN_DEBUG "%s: hdlc_open() carrier %i open %i\n", dev->name, hdlc->carrier, hdlc->open); #endif if (!hdlc->proto) return -ENOSYS; /* no protocol attached */ if (hdlc->proto->open) { int result = hdlc->proto->open(dev); if (result) return result; } spin_lock_irq(&hdlc->state_lock); if (hdlc->carrier) { netdev_info(dev, "Carrier detected\n"); hdlc_proto_start(dev); } else { netdev_info(dev, "No carrier\n"); } hdlc->open = 1; spin_unlock_irq(&hdlc->state_lock); return 0; } EXPORT_SYMBOL(hdlc_open); /* Must be called by hardware driver when HDLC device is being closed */ void hdlc_close(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); #ifdef DEBUG_LINK printk(KERN_DEBUG "%s: hdlc_close() carrier %i open %i\n", dev->name, hdlc->carrier, hdlc->open); #endif spin_lock_irq(&hdlc->state_lock); hdlc->open = 0; if (hdlc->carrier) hdlc_proto_stop(dev); spin_unlock_irq(&hdlc->state_lock); if (hdlc->proto->close) hdlc->proto->close(dev); } EXPORT_SYMBOL(hdlc_close); int hdlc_ioctl(struct net_device *dev, struct if_settings *ifs) { struct hdlc_proto *proto = first_proto; int result; if (dev_to_hdlc(dev)->proto) { result = dev_to_hdlc(dev)->proto->ioctl(dev, ifs); if (result != -EINVAL) return result; } /* Not handled by currently attached protocol (if any) */ while (proto) { result = proto->ioctl(dev, ifs); if (result != -EINVAL) return result; proto = proto->next; } return -EINVAL; } EXPORT_SYMBOL(hdlc_ioctl); static const struct header_ops hdlc_null_ops; static void hdlc_setup_dev(struct net_device *dev) { /* Re-init all variables changed by HDLC protocol drivers, * including ether_setup() called from hdlc_raw_eth.c. */ dev->flags = IFF_POINTOPOINT | IFF_NOARP; dev->priv_flags = IFF_WAN_HDLC; dev->mtu = HDLC_MAX_MTU; dev->min_mtu = 68; dev->max_mtu = HDLC_MAX_MTU; dev->type = ARPHRD_RAWHDLC; dev->hard_header_len = 0; dev->needed_headroom = 0; dev->addr_len = 0; dev->header_ops = &hdlc_null_ops; } static void hdlc_setup(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); hdlc_setup_dev(dev); hdlc->carrier = 1; hdlc->open = 0; spin_lock_init(&hdlc->state_lock); } struct net_device *alloc_hdlcdev(void *priv) { struct net_device *dev; dev = alloc_netdev(sizeof(struct hdlc_device), "hdlc%d", NET_NAME_UNKNOWN, hdlc_setup); if (dev) dev_to_hdlc(dev)->priv = priv; return dev; } EXPORT_SYMBOL(alloc_hdlcdev); void unregister_hdlc_device(struct net_device *dev) { rtnl_lock(); detach_hdlc_protocol(dev); unregister_netdevice(dev); rtnl_unlock(); } EXPORT_SYMBOL(unregister_hdlc_device); int attach_hdlc_protocol(struct net_device *dev, struct hdlc_proto *proto, size_t size) { int err; err = detach_hdlc_protocol(dev); if (err) return err; if (!try_module_get(proto->module)) return -ENOSYS; if (size) { dev_to_hdlc(dev)->state = kmalloc(size, GFP_KERNEL); if (!dev_to_hdlc(dev)->state) { module_put(proto->module); return -ENOBUFS; } } dev_to_hdlc(dev)->proto = proto; return 0; } EXPORT_SYMBOL(attach_hdlc_protocol); int detach_hdlc_protocol(struct net_device *dev) { hdlc_device *hdlc = dev_to_hdlc(dev); int err; if (hdlc->proto) { err = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, dev); err = notifier_to_errno(err); if (err) { netdev_err(dev, "Refused to change device type\n"); return err; } if (hdlc->proto->detach) hdlc->proto->detach(dev); module_put(hdlc->proto->module); hdlc->proto = NULL; } kfree(hdlc->state); hdlc->state = NULL; hdlc_setup_dev(dev); return 0; } EXPORT_SYMBOL(detach_hdlc_protocol); void register_hdlc_protocol(struct hdlc_proto *proto) { rtnl_lock(); proto->next = first_proto; first_proto = proto; rtnl_unlock(); } EXPORT_SYMBOL(register_hdlc_protocol); void unregister_hdlc_protocol(struct hdlc_proto *proto) { struct hdlc_proto **p; rtnl_lock(); p = &first_proto; while (*p != proto) { BUG_ON(!*p); p = &((*p)->next); } *p = proto->next; rtnl_unlock(); } EXPORT_SYMBOL(unregister_hdlc_protocol); MODULE_AUTHOR("Krzysztof Halasa <khc@pm.waw.pl>"); MODULE_DESCRIPTION("HDLC support module"); MODULE_LICENSE("GPL v2"); static struct packet_type hdlc_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_HDLC), .func = hdlc_rcv, }; static struct notifier_block hdlc_notifier = { .notifier_call = hdlc_device_event, }; static int __init hdlc_module_init(void) { int result; pr_info("%s\n", version); result = register_netdevice_notifier(&hdlc_notifier); if (result) return result; dev_add_pack(&hdlc_packet_type); return 0; } static void __exit hdlc_module_exit(void) { dev_remove_pack(&hdlc_packet_type); unregister_netdevice_notifier(&hdlc_notifier); } module_init(hdlc_module_init); module_exit(hdlc_module_exit); |
160 46 309 57 6 6 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_NULLS_H #define _LINUX_RCULIST_NULLS_H #ifdef __KERNEL__ /* * RCU-protected list version */ #include <linux/list_nulls.h> #include <linux/rcupdate.h> /** * hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization * @n: the element to delete from the hash list. * * Note: hlist_nulls_unhashed() on the node return true after this. It is * useful for RCU based read lockfree traversal if the writer side * must know if the list entry is still hashed or already unhashed. * * In particular, it means that we can not poison the forward pointers * that may still be used for walking the hash list and we can only * zero the pprev pointer so list_unhashed() will return true after * this. * * The caller must take whatever precautions are necessary (such as * holding appropriate locks) to avoid racing with another * list-mutation primitive, such as hlist_nulls_add_head_rcu() or * hlist_nulls_del_rcu(), running on this same list. However, it is * perfectly legal to run concurrently with the _rcu list-traversal * primitives, such as hlist_nulls_for_each_entry_rcu(). */ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n) { if (!hlist_nulls_unhashed(n)) { __hlist_nulls_del(n); WRITE_ONCE(n->pprev, NULL); } } /** * hlist_nulls_first_rcu - returns the first element of the hash list. * @head: the head of the list. */ #define hlist_nulls_first_rcu(head) \ (*((struct hlist_nulls_node __rcu __force **)&(head)->first)) /** * hlist_nulls_next_rcu - returns the element of the list after @node. * @node: element of the list. */ #define hlist_nulls_next_rcu(node) \ (*((struct hlist_nulls_node __rcu __force **)&(node)->next)) /** * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: hlist_nulls_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() * or hlist_nulls_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_nulls_for_each_entry(). */ static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n) { __hlist_nulls_del(n); WRITE_ONCE(n->pprev, LIST_POISON2); } /** * hlist_nulls_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_nulls, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() * or hlist_nulls_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, struct hlist_nulls_head *h) { struct hlist_nulls_node *first = h->first; WRITE_ONCE(n->next, first); WRITE_ONCE(n->pprev, &h->first); rcu_assign_pointer(hlist_nulls_first_rcu(h), n); if (!is_a_nulls(first)) WRITE_ONCE(first->pprev, &n->next); } /** * hlist_nulls_add_tail_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist_nulls, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() * or hlist_nulls_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, struct hlist_nulls_head *h) { struct hlist_nulls_node *i, *last = NULL; /* Note: write side code, so rcu accessors are not needed. */ for (i = h->first; !is_a_nulls(i); i = i->next) last = i; if (last) { WRITE_ONCE(n->next, last->next); n->pprev = &last->next; rcu_assign_pointer(hlist_nulls_next_rcu(last), n); } else { hlist_nulls_add_head_rcu(n, h); } } /* after that hlist_nulls_del will work */ static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n) { n->pprev = &n->next; n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL); } /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_nulls_node to use as a loop cursor. * @head: the head of the list. * @member: the name of the hlist_nulls_node within the struct. * * The barrier() is needed to make sure compiler doesn't cache first element [1], * as this loop can be restarted [2] * [1] Documentation/memory-barriers.txt around line 1533 * [2] Documentation/RCU/rculist_nulls.rst around line 146 */ #define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ for (({barrier();}), \ pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos))) /** * hlist_nulls_for_each_entry_safe - * iterate over list of given type safe against removal of list entry * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_nulls_node to use as a loop cursor. * @head: the head of the list. * @member: the name of the hlist_nulls_node within the struct. */ #define hlist_nulls_for_each_entry_safe(tpos, pos, head, member) \ for (({barrier();}), \ pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); \ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)); 1; });) #endif #endif |
9 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/types.h> #include <linux/module.h> #include <linux/crc64.h> #include <linux/err.h> #include <linux/init.h> #include <crypto/hash.h> #include <crypto/algapi.h> #include <linux/static_key.h> #include <linux/notifier.h> static struct crypto_shash __rcu *crc64_rocksoft_tfm; static DEFINE_STATIC_KEY_TRUE(crc64_rocksoft_fallback); static DEFINE_MUTEX(crc64_rocksoft_mutex); static struct work_struct crc64_rocksoft_rehash_work; static int crc64_rocksoft_notify(struct notifier_block *self, unsigned long val, void *data) { struct crypto_alg *alg = data; if (val != CRYPTO_MSG_ALG_LOADED || strcmp(alg->cra_name, CRC64_ROCKSOFT_STRING)) return NOTIFY_DONE; schedule_work(&crc64_rocksoft_rehash_work); return NOTIFY_OK; } static void crc64_rocksoft_rehash(struct work_struct *work) { struct crypto_shash *new, *old; mutex_lock(&crc64_rocksoft_mutex); old = rcu_dereference_protected(crc64_rocksoft_tfm, lockdep_is_held(&crc64_rocksoft_mutex)); new = crypto_alloc_shash(CRC64_ROCKSOFT_STRING, 0, 0); if (IS_ERR(new)) { mutex_unlock(&crc64_rocksoft_mutex); return; } rcu_assign_pointer(crc64_rocksoft_tfm, new); mutex_unlock(&crc64_rocksoft_mutex); if (old) { synchronize_rcu(); crypto_free_shash(old); } else { static_branch_disable(&crc64_rocksoft_fallback); } } static struct notifier_block crc64_rocksoft_nb = { .notifier_call = crc64_rocksoft_notify, }; u64 crc64_rocksoft_update(u64 crc, const unsigned char *buffer, size_t len) { struct { struct shash_desc shash; u64 crc; } desc; int err; if (static_branch_unlikely(&crc64_rocksoft_fallback)) return crc64_rocksoft_generic(crc, buffer, len); rcu_read_lock(); desc.shash.tfm = rcu_dereference(crc64_rocksoft_tfm); desc.crc = crc; err = crypto_shash_update(&desc.shash, buffer, len); rcu_read_unlock(); BUG_ON(err); return desc.crc; } EXPORT_SYMBOL_GPL(crc64_rocksoft_update); u64 crc64_rocksoft(const unsigned char *buffer, size_t len) { return crc64_rocksoft_update(0, buffer, len); } EXPORT_SYMBOL_GPL(crc64_rocksoft); static int __init crc64_rocksoft_mod_init(void) { INIT_WORK(&crc64_rocksoft_rehash_work, crc64_rocksoft_rehash); crypto_register_notifier(&crc64_rocksoft_nb); crc64_rocksoft_rehash(&crc64_rocksoft_rehash_work); return 0; } static void __exit crc64_rocksoft_mod_fini(void) { crypto_unregister_notifier(&crc64_rocksoft_nb); cancel_work_sync(&crc64_rocksoft_rehash_work); crypto_free_shash(rcu_dereference_protected(crc64_rocksoft_tfm, 1)); } module_init(crc64_rocksoft_mod_init); module_exit(crc64_rocksoft_mod_fini); static int crc64_rocksoft_transform_show(char *buffer, const struct kernel_param *kp) { struct crypto_shash *tfm; int len; if (static_branch_unlikely(&crc64_rocksoft_fallback)) return sprintf(buffer, "fallback\n"); rcu_read_lock(); tfm = rcu_dereference(crc64_rocksoft_tfm); len = snprintf(buffer, PAGE_SIZE, "%s\n", crypto_shash_driver_name(tfm)); rcu_read_unlock(); return len; } module_param_call(transform, NULL, crc64_rocksoft_transform_show, NULL, 0444); MODULE_AUTHOR("Keith Busch <kbusch@kernel.org>"); MODULE_DESCRIPTION("Rocksoft model CRC64 calculation (library API)"); MODULE_LICENSE("GPL"); MODULE_SOFTDEP("pre: crc64"); |
1 2 3 3 3 6 2 1 3 3 7 6 1 6 1 5 1 1 2 3 2 1 1 2 6 1 1 4 1 3 1 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 | // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> #include <net/ip.h> #include <net/tcp.h> #include <net/netlink.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_synproxy.h> #include <linux/netfilter/nf_tables.h> #include <linux/netfilter/nf_synproxy.h> struct nft_synproxy { struct nf_synproxy_info info; }; static const struct nla_policy nft_synproxy_policy[NFTA_SYNPROXY_MAX + 1] = { [NFTA_SYNPROXY_MSS] = { .type = NLA_U16 }, [NFTA_SYNPROXY_WSCALE] = { .type = NLA_U8 }, [NFTA_SYNPROXY_FLAGS] = { .type = NLA_U32 }, }; static void nft_synproxy_tcp_options(struct synproxy_options *opts, const struct tcphdr *tcp, struct synproxy_net *snet, struct nf_synproxy_info *info, const struct nft_synproxy *priv) { this_cpu_inc(snet->stats->syn_received); if (tcp->ece && tcp->cwr) opts->options |= NF_SYNPROXY_OPT_ECN; opts->options &= priv->info.options; opts->mss_encode = opts->mss_option; opts->mss_option = info->mss; if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP) synproxy_init_timestamp_cookie(info, opts); else opts->options &= ~(NF_SYNPROXY_OPT_WSCALE | NF_SYNPROXY_OPT_SACK_PERM | NF_SYNPROXY_OPT_ECN); } static void nft_synproxy_eval_v4(const struct nft_synproxy *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt, const struct tcphdr *tcp, struct tcphdr *_tcph, struct synproxy_options *opts) { struct nf_synproxy_info info = priv->info; struct net *net = nft_net(pkt); struct synproxy_net *snet = synproxy_pernet(net); struct sk_buff *skb = pkt->skb; if (tcp->syn) { /* Initial SYN from client */ nft_synproxy_tcp_options(opts, tcp, snet, &info, priv); synproxy_send_client_synack(net, skb, tcp, opts); consume_skb(skb); regs->verdict.code = NF_STOLEN; } else if (tcp->ack) { /* ACK from client */ if (synproxy_recv_client_ack(net, skb, tcp, opts, ntohl(tcp->seq))) { consume_skb(skb); regs->verdict.code = NF_STOLEN; } else { regs->verdict.code = NF_DROP; } } } #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) static void nft_synproxy_eval_v6(const struct nft_synproxy *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt, const struct tcphdr *tcp, struct tcphdr *_tcph, struct synproxy_options *opts) { struct nf_synproxy_info info = priv->info; struct net *net = nft_net(pkt); struct synproxy_net *snet = synproxy_pernet(net); struct sk_buff *skb = pkt->skb; if (tcp->syn) { /* Initial SYN from client */ nft_synproxy_tcp_options(opts, tcp, snet, &info, priv); synproxy_send_client_synack_ipv6(net, skb, tcp, opts); consume_skb(skb); regs->verdict.code = NF_STOLEN; } else if (tcp->ack) { /* ACK from client */ if (synproxy_recv_client_ack_ipv6(net, skb, tcp, opts, ntohl(tcp->seq))) { consume_skb(skb); regs->verdict.code = NF_STOLEN; } else { regs->verdict.code = NF_DROP; } } } #endif /* CONFIG_NF_TABLES_IPV6*/ static void nft_synproxy_do_eval(const struct nft_synproxy *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct synproxy_options opts = {}; struct sk_buff *skb = pkt->skb; int thoff = nft_thoff(pkt); const struct tcphdr *tcp; struct tcphdr _tcph; if (pkt->tprot != IPPROTO_TCP) { regs->verdict.code = NFT_BREAK; return; } if (nf_ip_checksum(skb, nft_hook(pkt), thoff, IPPROTO_TCP)) { regs->verdict.code = NF_DROP; return; } tcp = skb_header_pointer(skb, thoff, sizeof(struct tcphdr), &_tcph); if (!tcp) { regs->verdict.code = NF_DROP; return; } if (!synproxy_parse_options(skb, thoff, tcp, &opts)) { regs->verdict.code = NF_DROP; return; } switch (skb->protocol) { case htons(ETH_P_IP): nft_synproxy_eval_v4(priv, regs, pkt, tcp, &_tcph, &opts); return; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case htons(ETH_P_IPV6): nft_synproxy_eval_v6(priv, regs, pkt, tcp, &_tcph, &opts); return; #endif } regs->verdict.code = NFT_BREAK; } static int nft_synproxy_do_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_synproxy *priv) { struct synproxy_net *snet = synproxy_pernet(ctx->net); u32 flags; int err; if (tb[NFTA_SYNPROXY_MSS]) priv->info.mss = ntohs(nla_get_be16(tb[NFTA_SYNPROXY_MSS])); if (tb[NFTA_SYNPROXY_WSCALE]) priv->info.wscale = nla_get_u8(tb[NFTA_SYNPROXY_WSCALE]); if (tb[NFTA_SYNPROXY_FLAGS]) { flags = ntohl(nla_get_be32(tb[NFTA_SYNPROXY_FLAGS])); if (flags & ~NF_SYNPROXY_OPT_MASK) return -EOPNOTSUPP; priv->info.options = flags; } err = nf_ct_netns_get(ctx->net, ctx->family); if (err) return err; switch (ctx->family) { case NFPROTO_IPV4: err = nf_synproxy_ipv4_init(snet, ctx->net); if (err) goto nf_ct_failure; break; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: err = nf_synproxy_ipv6_init(snet, ctx->net); if (err) goto nf_ct_failure; break; #endif case NFPROTO_INET: err = nf_synproxy_ipv4_init(snet, ctx->net); if (err) goto nf_ct_failure; err = nf_synproxy_ipv6_init(snet, ctx->net); if (err) { nf_synproxy_ipv4_fini(snet, ctx->net); goto nf_ct_failure; } break; } return 0; nf_ct_failure: nf_ct_netns_put(ctx->net, ctx->family); return err; } static void nft_synproxy_do_destroy(const struct nft_ctx *ctx) { struct synproxy_net *snet = synproxy_pernet(ctx->net); switch (ctx->family) { case NFPROTO_IPV4: nf_synproxy_ipv4_fini(snet, ctx->net); break; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: nf_synproxy_ipv6_fini(snet, ctx->net); break; #endif case NFPROTO_INET: nf_synproxy_ipv4_fini(snet, ctx->net); nf_synproxy_ipv6_fini(snet, ctx->net); break; } nf_ct_netns_put(ctx->net, ctx->family); } static int nft_synproxy_do_dump(struct sk_buff *skb, struct nft_synproxy *priv) { if (nla_put_be16(skb, NFTA_SYNPROXY_MSS, htons(priv->info.mss)) || nla_put_u8(skb, NFTA_SYNPROXY_WSCALE, priv->info.wscale) || nla_put_be32(skb, NFTA_SYNPROXY_FLAGS, htonl(priv->info.options))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static void nft_synproxy_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_synproxy *priv = nft_expr_priv(expr); nft_synproxy_do_eval(priv, regs, pkt); } static int nft_synproxy_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD)); } static int nft_synproxy_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_synproxy *priv = nft_expr_priv(expr); return nft_synproxy_do_init(ctx, tb, priv); } static void nft_synproxy_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { nft_synproxy_do_destroy(ctx); } static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { struct nft_synproxy *priv = nft_expr_priv(expr); return nft_synproxy_do_dump(skb, priv); } static struct nft_expr_type nft_synproxy_type; static const struct nft_expr_ops nft_synproxy_ops = { .eval = nft_synproxy_eval, .size = NFT_EXPR_SIZE(sizeof(struct nft_synproxy)), .init = nft_synproxy_init, .destroy = nft_synproxy_destroy, .dump = nft_synproxy_dump, .type = &nft_synproxy_type, .validate = nft_synproxy_validate, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_synproxy_type __read_mostly = { .ops = &nft_synproxy_ops, .name = "synproxy", .owner = THIS_MODULE, .policy = nft_synproxy_policy, .maxattr = NFTA_SYNPROXY_MAX, }; static int nft_synproxy_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_synproxy *priv = nft_obj_data(obj); return nft_synproxy_do_init(ctx, tb, priv); } static void nft_synproxy_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { nft_synproxy_do_destroy(ctx); } static int nft_synproxy_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { struct nft_synproxy *priv = nft_obj_data(obj); return nft_synproxy_do_dump(skb, priv); } static void nft_synproxy_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_synproxy *priv = nft_obj_data(obj); nft_synproxy_do_eval(priv, regs, pkt); } static void nft_synproxy_obj_update(struct nft_object *obj, struct nft_object *newobj) { struct nft_synproxy *newpriv = nft_obj_data(newobj); struct nft_synproxy *priv = nft_obj_data(obj); priv->info = newpriv->info; } static struct nft_object_type nft_synproxy_obj_type; static const struct nft_object_ops nft_synproxy_obj_ops = { .type = &nft_synproxy_obj_type, .size = sizeof(struct nft_synproxy), .init = nft_synproxy_obj_init, .destroy = nft_synproxy_obj_destroy, .dump = nft_synproxy_obj_dump, .eval = nft_synproxy_obj_eval, .update = nft_synproxy_obj_update, }; static struct nft_object_type nft_synproxy_obj_type __read_mostly = { .type = NFT_OBJECT_SYNPROXY, .ops = &nft_synproxy_obj_ops, .maxattr = NFTA_SYNPROXY_MAX, .policy = nft_synproxy_policy, .owner = THIS_MODULE, }; static int __init nft_synproxy_module_init(void) { int err; err = nft_register_obj(&nft_synproxy_obj_type); if (err < 0) return err; err = nft_register_expr(&nft_synproxy_type); if (err < 0) goto err; return 0; err: nft_unregister_obj(&nft_synproxy_obj_type); return err; } static void __exit nft_synproxy_module_exit(void) { nft_unregister_expr(&nft_synproxy_type); nft_unregister_obj(&nft_synproxy_obj_type); } module_init(nft_synproxy_module_init); module_exit(nft_synproxy_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>"); MODULE_ALIAS_NFT_EXPR("synproxy"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_SYNPROXY); MODULE_DESCRIPTION("nftables SYNPROXY expression support"); |
26 115 150 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_TSTAMP_H #define _NF_CONNTRACK_TSTAMP_H #include <net/net_namespace.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tuple_common.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> struct nf_conn_tstamp { u_int64_t start; u_int64_t stop; }; static inline struct nf_conn_tstamp *nf_conn_tstamp_find(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP return nf_ct_ext_find(ct, NF_CT_EXT_TSTAMP); #else return NULL; #endif } static inline struct nf_conn_tstamp *nf_ct_tstamp_ext_add(struct nf_conn *ct, gfp_t gfp) { #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP struct net *net = nf_ct_net(ct); if (!net->ct.sysctl_tstamp) return NULL; return nf_ct_ext_add(ct, NF_CT_EXT_TSTAMP, gfp); #else return NULL; #endif }; #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP void nf_conntrack_tstamp_pernet_init(struct net *net); #else static inline void nf_conntrack_tstamp_pernet_init(struct net *net) {} #endif /* CONFIG_NF_CONNTRACK_TIMESTAMP */ #endif /* _NF_CONNTRACK_TSTAMP_H */ |
3 3 3 3 3 1 1 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 | // SPDX-License-Identifier: GPL-2.0 /* * Tty port functions */ #include <linux/types.h> #include <linux/errno.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/serial.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/serdev.h> #include "tty.h" static size_t tty_port_default_receive_buf(struct tty_port *port, const u8 *p, const u8 *f, size_t count) { struct tty_struct *tty; struct tty_ldisc *ld; tty = READ_ONCE(port->itty); if (!tty) return 0; ld = tty_ldisc_ref(tty); if (!ld) return 0; count = tty_ldisc_receive_buf(ld, p, f, count); tty_ldisc_deref(ld); return count; } static void tty_port_default_lookahead_buf(struct tty_port *port, const u8 *p, const u8 *f, size_t count) { struct tty_struct *tty; struct tty_ldisc *ld; tty = READ_ONCE(port->itty); if (!tty) return; ld = tty_ldisc_ref(tty); if (!ld) return; if (ld->ops->lookahead_buf) ld->ops->lookahead_buf(ld->tty, p, f, count); tty_ldisc_deref(ld); } static void tty_port_default_wakeup(struct tty_port *port) { struct tty_struct *tty = tty_port_tty_get(port); if (tty) { tty_wakeup(tty); tty_kref_put(tty); } } const struct tty_port_client_operations tty_port_default_client_ops = { .receive_buf = tty_port_default_receive_buf, .lookahead_buf = tty_port_default_lookahead_buf, .write_wakeup = tty_port_default_wakeup, }; EXPORT_SYMBOL_GPL(tty_port_default_client_ops); /** * tty_port_init - initialize tty_port * @port: tty_port to initialize * * Initializes the state of struct tty_port. When a port was initialized using * this function, one has to destroy the port by tty_port_destroy(). Either * indirectly by using &tty_port refcounting (tty_port_put()) or directly if * refcounting is not used. */ void tty_port_init(struct tty_port *port) { memset(port, 0, sizeof(*port)); tty_buffer_init(port); init_waitqueue_head(&port->open_wait); init_waitqueue_head(&port->delta_msr_wait); mutex_init(&port->mutex); mutex_init(&port->buf_mutex); spin_lock_init(&port->lock); port->close_delay = (50 * HZ) / 100; port->closing_wait = (3000 * HZ) / 100; port->client_ops = &tty_port_default_client_ops; kref_init(&port->kref); } EXPORT_SYMBOL(tty_port_init); /** * tty_port_link_device - link tty and tty_port * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * * Provide the tty layer with a link from a tty (specified by @index) to a * tty_port (@port). Use this only if neither tty_port_register_device() nor * tty_port_install() is used in the driver. If used, this has to be called * before tty_register_driver(). */ void tty_port_link_device(struct tty_port *port, struct tty_driver *driver, unsigned index) { if (WARN_ON(index >= driver->num)) return; driver->ports[index] = port; } EXPORT_SYMBOL_GPL(tty_port_link_device); /** * tty_port_register_device - register tty device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @device: parent if exists, otherwise NULL * * It is the same as tty_register_device() except the provided @port is linked * to a concrete tty specified by @index. Use this or tty_port_install() (or * both). Call tty_port_link_device() as a last resort. */ struct device *tty_port_register_device(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device) { return tty_port_register_device_attr(port, driver, index, device, NULL, NULL); } EXPORT_SYMBOL_GPL(tty_port_register_device); /** * tty_port_register_device_attr - register tty device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @device: parent if exists, otherwise NULL * @drvdata: Driver data to be set to device. * @attr_grp: Attribute group to be set on device. * * It is the same as tty_register_device_attr() except the provided @port is * linked to a concrete tty specified by @index. Use this or tty_port_install() * (or both). Call tty_port_link_device() as a last resort. */ struct device *tty_port_register_device_attr(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp) { tty_port_link_device(port, driver, index); return tty_register_device_attr(driver, index, device, drvdata, attr_grp); } EXPORT_SYMBOL_GPL(tty_port_register_device_attr); /** * tty_port_register_device_attr_serdev - register tty or serdev device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @host: serial port hardware device * @parent: parent if exists, otherwise NULL * @drvdata: driver data for the device * @attr_grp: attribute group for the device * * Register a serdev or tty device depending on if the parent device has any * defined serdev clients or not. */ struct device *tty_port_register_device_attr_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *host, struct device *parent, void *drvdata, const struct attribute_group **attr_grp) { struct device *dev; tty_port_link_device(port, driver, index); dev = serdev_tty_port_register(port, host, parent, driver, index); if (PTR_ERR(dev) != -ENODEV) { /* Skip creating cdev if we registered a serdev device */ return dev; } return tty_register_device_attr(driver, index, parent, drvdata, attr_grp); } EXPORT_SYMBOL_GPL(tty_port_register_device_attr_serdev); /** * tty_port_register_device_serdev - register tty or serdev device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * @host: serial port hardware controller device * @parent: parent if exists, otherwise NULL * * Register a serdev or tty device depending on if the parent device has any * defined serdev clients or not. */ struct device *tty_port_register_device_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *host, struct device *parent) { return tty_port_register_device_attr_serdev(port, driver, index, host, parent, NULL, NULL); } EXPORT_SYMBOL_GPL(tty_port_register_device_serdev); /** * tty_port_unregister_device - deregister a tty or serdev device * @port: tty_port of the device * @driver: tty_driver for this device * @index: index of the tty * * If a tty or serdev device is registered with a call to * tty_port_register_device_serdev() then this function must be called when * the device is gone. */ void tty_port_unregister_device(struct tty_port *port, struct tty_driver *driver, unsigned index) { int ret; ret = serdev_tty_port_unregister(port); if (ret == 0) return; tty_unregister_device(driver, index); } EXPORT_SYMBOL_GPL(tty_port_unregister_device); int tty_port_alloc_xmit_buf(struct tty_port *port) { /* We may sleep in get_zeroed_page() */ mutex_lock(&port->buf_mutex); if (port->xmit_buf == NULL) { port->xmit_buf = (u8 *)get_zeroed_page(GFP_KERNEL); if (port->xmit_buf) kfifo_init(&port->xmit_fifo, port->xmit_buf, PAGE_SIZE); } mutex_unlock(&port->buf_mutex); if (port->xmit_buf == NULL) return -ENOMEM; return 0; } EXPORT_SYMBOL(tty_port_alloc_xmit_buf); void tty_port_free_xmit_buf(struct tty_port *port) { mutex_lock(&port->buf_mutex); free_page((unsigned long)port->xmit_buf); port->xmit_buf = NULL; INIT_KFIFO(port->xmit_fifo); mutex_unlock(&port->buf_mutex); } EXPORT_SYMBOL(tty_port_free_xmit_buf); /** * tty_port_destroy - destroy inited port * @port: tty port to be destroyed * * When a port was initialized using tty_port_init(), one has to destroy the * port by this function. Either indirectly by using &tty_port refcounting * (tty_port_put()) or directly if refcounting is not used. */ void tty_port_destroy(struct tty_port *port) { tty_buffer_cancel_work(port); tty_buffer_free_all(port); } EXPORT_SYMBOL(tty_port_destroy); static void tty_port_destructor(struct kref *kref) { struct tty_port *port = container_of(kref, struct tty_port, kref); /* check if last port ref was dropped before tty release */ if (WARN_ON(port->itty)) return; free_page((unsigned long)port->xmit_buf); tty_port_destroy(port); if (port->ops && port->ops->destruct) port->ops->destruct(port); else kfree(port); } /** * tty_port_put - drop a reference to tty_port * @port: port to drop a reference of (can be NULL) * * The final put will destroy and free up the @port using * @port->ops->destruct() hook, or using kfree() if not provided. */ void tty_port_put(struct tty_port *port) { if (port) kref_put(&port->kref, tty_port_destructor); } EXPORT_SYMBOL(tty_port_put); /** * tty_port_tty_get - get a tty reference * @port: tty port * * Return a refcount protected tty instance or %NULL if the port is not * associated with a tty (eg due to close or hangup). */ struct tty_struct *tty_port_tty_get(struct tty_port *port) { unsigned long flags; struct tty_struct *tty; spin_lock_irqsave(&port->lock, flags); tty = tty_kref_get(port->tty); spin_unlock_irqrestore(&port->lock, flags); return tty; } EXPORT_SYMBOL(tty_port_tty_get); /** * tty_port_tty_set - set the tty of a port * @port: tty port * @tty: the tty * * Associate the port and tty pair. Manages any internal refcounts. Pass %NULL * to deassociate a port. */ void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&port->lock, flags); tty_kref_put(port->tty); port->tty = tty_kref_get(tty); spin_unlock_irqrestore(&port->lock, flags); } EXPORT_SYMBOL(tty_port_tty_set); /** * tty_port_shutdown - internal helper to shutdown the device * @port: tty port to be shut down * @tty: the associated tty * * It is used by tty_port_hangup() and tty_port_close(). Its task is to * shutdown the device if it was initialized (note consoles remain * functioning). It lowers DTR/RTS (if @tty has HUPCL set) and invokes * @port->ops->shutdown(). */ static void tty_port_shutdown(struct tty_port *port, struct tty_struct *tty) { mutex_lock(&port->mutex); if (port->console) goto out; if (tty_port_initialized(port)) { tty_port_set_initialized(port, false); /* * Drop DTR/RTS if HUPCL is set. This causes any attached * modem to hang up the line. */ if (tty && C_HUPCL(tty)) tty_port_lower_dtr_rts(port); if (port->ops->shutdown) port->ops->shutdown(port); } out: mutex_unlock(&port->mutex); } /** * tty_port_hangup - hangup helper * @port: tty port * * Perform port level tty hangup flag and count changes. Drop the tty * reference. * * Caller holds tty lock. */ void tty_port_hangup(struct tty_port *port) { struct tty_struct *tty; unsigned long flags; spin_lock_irqsave(&port->lock, flags); port->count = 0; tty = port->tty; if (tty) set_bit(TTY_IO_ERROR, &tty->flags); port->tty = NULL; spin_unlock_irqrestore(&port->lock, flags); tty_port_set_active(port, false); tty_port_shutdown(port, tty); tty_kref_put(tty); wake_up_interruptible(&port->open_wait); wake_up_interruptible(&port->delta_msr_wait); } EXPORT_SYMBOL(tty_port_hangup); /** * tty_port_tty_hangup - helper to hang up a tty * @port: tty port * @check_clocal: hang only ttys with %CLOCAL unset? */ void tty_port_tty_hangup(struct tty_port *port, bool check_clocal) { struct tty_struct *tty = tty_port_tty_get(port); if (tty && (!check_clocal || !C_CLOCAL(tty))) tty_hangup(tty); tty_kref_put(tty); } EXPORT_SYMBOL_GPL(tty_port_tty_hangup); /** * tty_port_tty_wakeup - helper to wake up a tty * @port: tty port */ void tty_port_tty_wakeup(struct tty_port *port) { port->client_ops->write_wakeup(port); } EXPORT_SYMBOL_GPL(tty_port_tty_wakeup); /** * tty_port_carrier_raised - carrier raised check * @port: tty port * * Wrapper for the carrier detect logic. For the moment this is used * to hide some internal details. This will eventually become entirely * internal to the tty port. */ bool tty_port_carrier_raised(struct tty_port *port) { if (port->ops->carrier_raised == NULL) return true; return port->ops->carrier_raised(port); } EXPORT_SYMBOL(tty_port_carrier_raised); /** * tty_port_raise_dtr_rts - Raise DTR/RTS * @port: tty port * * Wrapper for the DTR/RTS raise logic. For the moment this is used to hide * some internal details. This will eventually become entirely internal to the * tty port. */ void tty_port_raise_dtr_rts(struct tty_port *port) { if (port->ops->dtr_rts) port->ops->dtr_rts(port, true); } EXPORT_SYMBOL(tty_port_raise_dtr_rts); /** * tty_port_lower_dtr_rts - Lower DTR/RTS * @port: tty port * * Wrapper for the DTR/RTS raise logic. For the moment this is used to hide * some internal details. This will eventually become entirely internal to the * tty port. */ void tty_port_lower_dtr_rts(struct tty_port *port) { if (port->ops->dtr_rts) port->ops->dtr_rts(port, false); } EXPORT_SYMBOL(tty_port_lower_dtr_rts); /** * tty_port_block_til_ready - Waiting logic for tty open * @port: the tty port being opened * @tty: the tty device being bound * @filp: the file pointer of the opener or %NULL * * Implement the core POSIX/SuS tty behaviour when opening a tty device. * Handles: * * - hangup (both before and during) * - non blocking open * - rts/dtr/dcd * - signals * - port flags and counts * * The passed @port must implement the @port->ops->carrier_raised method if it * can do carrier detect and the @port->ops->dtr_rts method if it supports * software management of these lines. Note that the dtr/rts raise is done each * iteration as a hangup may have previously dropped them while we wait. * * Caller holds tty lock. * * Note: May drop and reacquire tty lock when blocking, so @tty and @port may * have changed state (eg., may have been hung up). */ int tty_port_block_til_ready(struct tty_port *port, struct tty_struct *tty, struct file *filp) { int do_clocal = 0, retval; unsigned long flags; DEFINE_WAIT(wait); /* if non-blocking mode is set we can pass directly to open unless * the port has just hung up or is in another error state. */ if (tty_io_error(tty)) { tty_port_set_active(port, true); return 0; } if (filp == NULL || (filp->f_flags & O_NONBLOCK)) { /* Indicate we are open */ if (C_BAUD(tty)) tty_port_raise_dtr_rts(port); tty_port_set_active(port, true); return 0; } if (C_CLOCAL(tty)) do_clocal = 1; /* Block waiting until we can proceed. We may need to wait for the * carrier, but we must also wait for any close that is in progress * before the next open may complete. */ retval = 0; /* The port lock protects the port counts */ spin_lock_irqsave(&port->lock, flags); port->count--; port->blocked_open++; spin_unlock_irqrestore(&port->lock, flags); while (1) { /* Indicate we are open */ if (C_BAUD(tty) && tty_port_initialized(port)) tty_port_raise_dtr_rts(port); prepare_to_wait(&port->open_wait, &wait, TASK_INTERRUPTIBLE); /* Check for a hangup or uninitialised port. * Return accordingly. */ if (tty_hung_up_p(filp) || !tty_port_initialized(port)) { if (port->flags & ASYNC_HUP_NOTIFY) retval = -EAGAIN; else retval = -ERESTARTSYS; break; } /* * Probe the carrier. For devices with no carrier detect * tty_port_carrier_raised will always return true. * Never ask drivers if CLOCAL is set, this causes troubles * on some hardware. */ if (do_clocal || tty_port_carrier_raised(port)) break; if (signal_pending(current)) { retval = -ERESTARTSYS; break; } tty_unlock(tty); schedule(); tty_lock(tty); } finish_wait(&port->open_wait, &wait); /* Update counts. A parallel hangup will have set count to zero and * we must not mess that up further. */ spin_lock_irqsave(&port->lock, flags); if (!tty_hung_up_p(filp)) port->count++; port->blocked_open--; spin_unlock_irqrestore(&port->lock, flags); if (retval == 0) tty_port_set_active(port, true); return retval; } EXPORT_SYMBOL(tty_port_block_til_ready); static void tty_port_drain_delay(struct tty_port *port, struct tty_struct *tty) { unsigned int bps = tty_get_baud_rate(tty); long timeout; if (bps > 1200) { timeout = (HZ * 10 * port->drain_delay) / bps; timeout = max_t(long, timeout, HZ / 10); } else { timeout = 2 * HZ; } schedule_timeout_interruptible(timeout); } /** * tty_port_close_start - helper for tty->ops->close, part 1/2 * @port: tty_port of the device * @tty: tty being closed * @filp: passed file pointer * * Decrements and checks open count. Flushes the port if this is the last * close. That means, dropping the data from the outpu buffer on the device and * waiting for sending logic to finish. The rest of close handling is performed * in tty_port_close_end(). * * Locking: Caller holds tty lock. * * Return: 1 if this is the last close, otherwise 0 */ int tty_port_close_start(struct tty_port *port, struct tty_struct *tty, struct file *filp) { unsigned long flags; if (tty_hung_up_p(filp)) return 0; spin_lock_irqsave(&port->lock, flags); if (tty->count == 1 && port->count != 1) { tty_warn(tty, "%s: tty->count = 1 port count = %d\n", __func__, port->count); port->count = 1; } if (--port->count < 0) { tty_warn(tty, "%s: bad port count (%d)\n", __func__, port->count); port->count = 0; } if (port->count) { spin_unlock_irqrestore(&port->lock, flags); return 0; } spin_unlock_irqrestore(&port->lock, flags); tty->closing = 1; if (tty_port_initialized(port)) { /* Don't block on a stalled port, just pull the chain */ if (tty->flow.tco_stopped) tty_driver_flush_buffer(tty); if (port->closing_wait != ASYNC_CLOSING_WAIT_NONE) tty_wait_until_sent(tty, port->closing_wait); if (port->drain_delay) tty_port_drain_delay(port, tty); } /* Flush the ldisc buffering */ tty_ldisc_flush(tty); /* Report to caller this is the last port reference */ return 1; } EXPORT_SYMBOL(tty_port_close_start); /** * tty_port_close_end - helper for tty->ops->close, part 2/2 * @port: tty_port of the device * @tty: tty being closed * * This is a continuation of the first part: tty_port_close_start(). This * should be called after turning off the device. It flushes the data from the * line discipline and delays the close by @port->close_delay. * * Locking: Caller holds tty lock. */ void tty_port_close_end(struct tty_port *port, struct tty_struct *tty) { unsigned long flags; tty_ldisc_flush(tty); tty->closing = 0; spin_lock_irqsave(&port->lock, flags); if (port->blocked_open) { spin_unlock_irqrestore(&port->lock, flags); if (port->close_delay) msleep_interruptible(jiffies_to_msecs(port->close_delay)); spin_lock_irqsave(&port->lock, flags); wake_up_interruptible(&port->open_wait); } spin_unlock_irqrestore(&port->lock, flags); tty_port_set_active(port, false); } EXPORT_SYMBOL(tty_port_close_end); /** * tty_port_close - generic tty->ops->close handler * @port: tty_port of the device * @tty: tty being closed * @filp: passed file pointer * * It is a generic helper to be used in driver's @tty->ops->close. It wraps a * sequence of tty_port_close_start(), tty_port_shutdown(), and * tty_port_close_end(). The latter two are called only if this is the last * close. See the respective functions for the details. * * Locking: Caller holds tty lock */ void tty_port_close(struct tty_port *port, struct tty_struct *tty, struct file *filp) { if (tty_port_close_start(port, tty, filp) == 0) return; tty_port_shutdown(port, tty); if (!port->console) set_bit(TTY_IO_ERROR, &tty->flags); tty_port_close_end(port, tty); tty_port_tty_set(port, NULL); } EXPORT_SYMBOL(tty_port_close); /** * tty_port_install - generic tty->ops->install handler * @port: tty_port of the device * @driver: tty_driver for this device * @tty: tty to be installed * * It is the same as tty_standard_install() except the provided @port is linked * to a concrete tty specified by @tty. Use this or tty_port_register_device() * (or both). Call tty_port_link_device() as a last resort. */ int tty_port_install(struct tty_port *port, struct tty_driver *driver, struct tty_struct *tty) { tty->port = port; return tty_standard_install(driver, tty); } EXPORT_SYMBOL_GPL(tty_port_install); /** * tty_port_open - generic tty->ops->open handler * @port: tty_port of the device * @tty: tty to be opened * @filp: passed file pointer * * It is a generic helper to be used in driver's @tty->ops->open. It activates * the devices using @port->ops->activate if not active already. And waits for * the device to be ready using tty_port_block_til_ready() (e.g. raises * DTR/CTS and waits for carrier). * * Note that @port->ops->shutdown is not called when @port->ops->activate * returns an error (on the contrary, @tty->ops->close is). * * Locking: Caller holds tty lock. * * Note: may drop and reacquire tty lock (in tty_port_block_til_ready()) so * @tty and @port may have changed state (eg., may be hung up now). */ int tty_port_open(struct tty_port *port, struct tty_struct *tty, struct file *filp) { spin_lock_irq(&port->lock); ++port->count; spin_unlock_irq(&port->lock); tty_port_tty_set(port, tty); /* * Do the device-specific open only if the hardware isn't * already initialized. Serialize open and shutdown using the * port mutex. */ mutex_lock(&port->mutex); if (!tty_port_initialized(port)) { clear_bit(TTY_IO_ERROR, &tty->flags); if (port->ops->activate) { int retval = port->ops->activate(port, tty); if (retval) { mutex_unlock(&port->mutex); return retval; } } tty_port_set_initialized(port, true); } mutex_unlock(&port->mutex); return tty_port_block_til_ready(port, tty, filp); } EXPORT_SYMBOL(tty_port_open); |
8 8 8 8 8 1281 1285 1280 282 228 1284 3 8 8 8 8 8 8 8 7 8 8 8 8 2 2 2 2 7 5 2 2 3 1295 1296 16 1284 19 1283 1285 877 412 1277 1297 642 644 46 612 177 1120 1126 1125 1134 474 470 471 11 3 3 1 3 471 3 3 3 654 20784 3 20842 20871 20854 20848 228 13 7 8 213 5 649 20690 19407 20917 99 97 1565 145 1715 1707 1713 143 3 3 1 2 5 1 4 1 4 5 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/file.c * * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes * * Manage the dynamic fd arrays in the process files_struct. */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/close_range.h> #include <net/sock.h> #include "internal.h" unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ #define __const_min(x, y) ((x) < (y) ? (x) : (y)) unsigned int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; static void __free_fdtable(struct fdtable *fdt) { kvfree(fdt->fd); kvfree(fdt->open_fds); kfree(fdt); } static void free_fdtable_rcu(struct rcu_head *rcu) { __free_fdtable(container_of(rcu, struct fdtable, rcu)); } #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) #define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds /* * Copy 'count' fd bits from the old table to the new table and clear the extra * space if any. This does not copy the file pointers. Called with the files * spinlock held for write. */ static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, unsigned int copy_words) { unsigned int nwords = fdt_words(nfdt); bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds, copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec, copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits, copy_words, nwords); } /* * Copy all file descriptors from the old table to the new, expanded table and * clear the extra space. Called with the files spinlock held for write. */ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { size_t cpy, set; BUG_ON(nfdt->max_fds < ofdt->max_fds); cpy = ofdt->max_fds * sizeof(struct file *); set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); memcpy(nfdt->fd, ofdt->fd, cpy); memset((char *)nfdt->fd + cpy, 0, set); copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt)); } /* * Note how the fdtable bitmap allocations very much have to be a multiple of * BITS_PER_LONG. This is not only because we walk those things in chunks of * 'unsigned long' in some places, but simply because that is how the Linux * kernel bitmaps are defined to work: they are not "bits in an array of bytes", * they are very much "bits in an array of unsigned long". * * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied * by that "1024/sizeof(ptr)" before, we already know there are sufficient * clear low bits. Clang seems to realize that, gcc ends up being confused. * * On a 128-bit machine, the ALIGN() would actually matter. In the meantime, * let's consider it documentation (and maybe a test-case for gcc to improve * its code generation ;) */ static struct fdtable * alloc_fdtable(unsigned int nr) { struct fdtable *fdt; void *data; /* * Figure out how many fds we actually want to support in this fdtable. * Allocation steps are keyed to the size of the fdarray, since it * grows far faster than any of the other dynamic data. We try to fit * the fdarray into comfortable page-tuned chunks: starting at 1024B * and growing in powers of two from there on. */ nr /= (1024 / sizeof(struct file *)); nr = roundup_pow_of_two(nr + 1); nr *= (1024 / sizeof(struct file *)); nr = ALIGN(nr, BITS_PER_LONG); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open * had been set lower between the check in expand_files() and here. Deal * with that in caller, it's cheaper that way. * * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ if (unlikely(nr > sysctl_nr_open)) nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); if (!data) goto out_fdt; fdt->fd = data; data = kvmalloc(max_t(size_t, 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), GFP_KERNEL_ACCOUNT); if (!data) goto out_arr; fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; data += nr / BITS_PER_BYTE; fdt->full_fds_bits = data; return fdt; out_arr: kvfree(fdt->fd); out_fdt: kfree(fdt); out: return NULL; } /* * Expand the file descriptor table. * This function will allocate a new fdtable and both fd array and fdset, of * the given size. * Return <0 error code on error; 1 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_fdtable(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr); /* make sure all fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. */ if (atomic_read(&files->count) > 1) synchronize_rcu(); spin_lock(&files->file_lock); if (!new_fdt) return -ENOMEM; /* * extremely unlikely race - sysctl_nr_open decreased between the check in * caller and alloc_fdtable(). Cheaper to catch it here... */ if (unlikely(new_fdt->max_fds <= nr)) { __free_fdtable(new_fdt); return -EMFILE; } cur_fdt = files_fdtable(files); BUG_ON(nr < cur_fdt->max_fds); copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); if (cur_fdt != &files->fdtab) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); /* coupled with smp_rmb() in fd_install() */ smp_wmb(); return 1; } /* * Expand files. * This function will expand the file structures, if the requested size exceeds * the current capacity and there is room for expansion. * Return <0 error code on error; 0 when nothing done; 1 when files were * expanded and execution may have blocked. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *fdt; int expanded = 0; repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) return expanded; /* Can we expand? */ if (nr >= sysctl_nr_open) return -EMFILE; if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); expanded = 1; wait_event(files->resize_wait, !files->resize_in_progress); spin_lock(&files->file_lock); goto repeat; } /* All good, so we try */ files->resize_in_progress = true; expanded = expand_fdtable(files, nr); files->resize_in_progress = false; wake_up_all(&files->resize_wait); return expanded; } static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt) { __set_bit(fd, fdt->close_on_exec); } static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt) { if (test_bit(fd, fdt->close_on_exec)) __clear_bit(fd, fdt->close_on_exec); } static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) { __set_bit(fd, fdt->open_fds); fd /= BITS_PER_LONG; if (!~fdt->open_fds[fd]) __set_bit(fd, fdt->full_fds_bits); } static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); } static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) { return test_bit(fd, fdt->open_fds); } /* * Note that a sane fdtable size always has to be a multiple of * BITS_PER_LONG, since we have bitmaps that are sized by this. * * punch_hole is optional - when close_range() is asked to unshare * and close, we don't need to copy descriptors in that range, so * a smaller cloned descriptor table might suffice if the last * currently opened descriptor falls into that range. */ static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole) { unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds); if (last == fdt->max_fds) return NR_OPEN_DEFAULT; if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) { last = find_last_bit(fdt->open_fds, punch_hole->from); if (last == punch_hole->from) return NR_OPEN_DEFAULT; } return ALIGN(last + 1, BITS_PER_LONG); } /* * Allocate a new descriptor table and copy contents from the passed in * instance. Returns a pointer to cloned table on success, ERR_PTR() * on failure. For 'punch_hole' see sane_fdtable_size(). */ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole) { struct files_struct *newf; struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; int error; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) return ERR_PTR(-ENOMEM); atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); newf->resize_in_progress = false; init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, punch_hole); /* * Check whether we need to allocate a larger fd array and fd set. */ while (unlikely(open_files > new_fdt->max_fds)) { spin_unlock(&oldf->file_lock); if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); new_fdt = alloc_fdtable(open_files - 1); if (!new_fdt) { error = -ENOMEM; goto out_release; } /* beyond sysctl_nr_open; nothing to do */ if (unlikely(new_fdt->max_fds < open_files)) { __free_fdtable(new_fdt); error = -EMFILE; goto out_release; } /* * Reacquire the oldf lock and a pointer to its fd table * who knows it may have a new bigger fd table. We need * the latest pointer. */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, punch_hole); } copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); old_fds = old_fdt->fd; new_fds = new_fdt->fd; for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; if (f) { get_file(f); } else { /* * The fd may be claimed in the fd bitmap but not yet * instantiated in the files array if a sibling thread * is partway through open(). So make sure that this * fd is available to the new process. */ __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); } spin_unlock(&oldf->file_lock); /* clear the remainder */ memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); rcu_assign_pointer(newf->fdt, new_fdt); return newf; out_release: kmem_cache_free(files_cachep, newf); return ERR_PTR(error); } static struct fdtable *close_files(struct files_struct * files) { /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the * files structure. */ struct fdtable *fdt = rcu_dereference_raw(files->fdt); unsigned int i, j = 0; for (;;) { unsigned long set; i = j * BITS_PER_LONG; if (i >= fdt->max_fds) break; set = fdt->open_fds[j++]; while (set) { if (set & 1) { struct file * file = xchg(&fdt->fd[i], NULL); if (file) { filp_close(file, files); cond_resched(); } } i++; set >>= 1; } } return fdt; } void put_files_struct(struct files_struct *files) { if (atomic_dec_and_test(&files->count)) { struct fdtable *fdt = close_files(files); /* free the arrays if they are not embedded */ if (fdt != &files->fdtab) __free_fdtable(fdt); kmem_cache_free(files_cachep, files); } } void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; if (files) { task_lock(tsk); tsk->files = NULL; task_unlock(tsk); put_files_struct(files); } } struct files_struct init_files = { .count = ATOMIC_INIT(1), .fdt = &init_files.fdtab, .fdtab = { .max_fds = NR_OPEN_DEFAULT, .fd = &init_files.fd_array[0], .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; if (bitbit >= maxfd) return maxfd; if (bitbit > start) start = bitbit; return find_next_zero_bit(fdt->open_fds, maxfd, start); } /* * allocate a file descriptor, mark it busy. */ static int alloc_fd(unsigned start, unsigned end, unsigned flags) { struct files_struct *files = current->files; unsigned int fd; int error; struct fdtable *fdt; spin_lock(&files->file_lock); repeat: fdt = files_fdtable(files); fd = start; if (fd < files->next_fd) fd = files->next_fd; if (fd < fdt->max_fds) fd = find_next_fd(fdt, fd); /* * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ error = -EMFILE; if (fd >= end) goto out; error = expand_files(files, fd); if (error < 0) goto out; /* * If we needed to expand the fs array we * might have blocked - try again. */ if (error) goto repeat; if (start <= files->next_fd) files->next_fd = fd + 1; __set_open_fd(fd, fdt); if (flags & O_CLOEXEC) __set_close_on_exec(fd, fdt); else __clear_close_on_exec(fd, fdt); error = fd; #if 1 /* Sanity check */ if (rcu_access_pointer(fdt->fd[fd]) != NULL) { printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); rcu_assign_pointer(fdt->fd[fd], NULL); } #endif out: spin_unlock(&files->file_lock); return error; } int __get_unused_fd_flags(unsigned flags, unsigned long nofile) { return alloc_fd(0, nofile, flags); } int get_unused_fd_flags(unsigned flags) { return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE)); } EXPORT_SYMBOL(get_unused_fd_flags); static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; } void put_unused_fd(unsigned int fd) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); } EXPORT_SYMBOL(put_unused_fd); /* * Install a file pointer in the fd array. * * The VFS is full of places where we drop the files lock between * setting the open_fds bitmap and installing the file in the file * array. At any such point, we are vulnerable to a dup2() race * installing a file in the array before us. We need to detect this and * fput() the struct file we are about to overwrite in this case. * * It should never happen - if we allow dup2() do it, _really_ bad things * will follow. * * This consumes the "file" refcount, so callers should treat it * as if they had called fput(file). */ void fd_install(unsigned int fd, struct file *file) { struct files_struct *files = current->files; struct fdtable *fdt; if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING))) return; rcu_read_lock_sched(); if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); return; } /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); } EXPORT_SYMBOL(fd_install); /** * file_close_fd_locked - return file associated with fd * @files: file struct to retrieve file from * @fd: file descriptor to retrieve file for * * Doesn't take a separate reference count. * * Context: files_lock must be held. * * Returns: The file associated with @fd (NULL if @fd is not open) */ struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) { struct fdtable *fdt = files_fdtable(files); struct file *file; lockdep_assert_held(&files->file_lock); if (fd >= fdt->max_fds) return NULL; fd = array_index_nospec(fd, fdt->max_fds); file = fdt->fd[fd]; if (file) { rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); } return file; } int close_fd(unsigned fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); if (!file) return -EBADF; return filp_close(file, files); } EXPORT_SYMBOL(close_fd); /** * last_fd - return last valid index into fd table * @fdt: File descriptor table. * * Context: Either rcu read lock or files_lock must be held. * * Returns: Last valid index into fdtable. */ static inline unsigned last_fd(struct fdtable *fdt) { return fdt->max_fds - 1; } static inline void __range_cloexec(struct files_struct *cur_fds, unsigned int fd, unsigned int max_fd) { struct fdtable *fdt; /* make sure we're using the correct maximum value */ spin_lock(&cur_fds->file_lock); fdt = files_fdtable(cur_fds); max_fd = min(last_fd(fdt), max_fd); if (fd <= max_fd) bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); spin_unlock(&cur_fds->file_lock); } static inline void __range_close(struct files_struct *files, unsigned int fd, unsigned int max_fd) { struct file *file; unsigned n; spin_lock(&files->file_lock); n = last_fd(files_fdtable(files)); max_fd = min(max_fd, n); for (; fd <= max_fd; fd++) { file = file_close_fd_locked(files, fd); if (file) { spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } else if (need_resched()) { spin_unlock(&files->file_lock); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } /** * __close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close * @flags: CLOSE_RANGE flags. * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. */ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) { struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) return -EINVAL; if (fd > max_fd) return -EINVAL; if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) { struct fd_range range = {fd, max_fd}, *punch_hole = ⦥ /* * If the caller requested all fds to be made cloexec we always * copy all of the file descriptors since they still want to * use them. */ if (flags & CLOSE_RANGE_CLOEXEC) punch_hole = NULL; fds = dup_fd(cur_fds, punch_hole); if (IS_ERR(fds)) return PTR_ERR(fds); /* * We used to share our file descriptor table, and have now * created a private one, make sure we're using it below. */ swap(cur_fds, fds); } if (flags & CLOSE_RANGE_CLOEXEC) __range_cloexec(cur_fds, fd, max_fd); else __range_close(cur_fds, fd, max_fd); if (fds) { /* * We're done closing the files we were supposed to. Time to install * the new file descriptor table and drop the old one. */ task_lock(me); me->files = cur_fds; task_unlock(me); put_files_struct(fds); } return 0; } /** * file_close_fd - return file associated with fd * @fd: file descriptor to retrieve file for * * Doesn't take a separate reference count. * * Returns: The file associated with @fd (NULL if @fd is not open) */ struct file *file_close_fd(unsigned int fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); return file; } void do_close_on_exec(struct files_struct *files) { unsigned i; struct fdtable *fdt; /* exec unshares first */ spin_lock(&files->file_lock); for (i = 0; ; i++) { unsigned long set; unsigned fd = i * BITS_PER_LONG; fdt = files_fdtable(files); if (fd >= fdt->max_fds) break; set = fdt->close_on_exec[i]; if (!set) continue; fdt->close_on_exec[i] = 0; for ( ; set ; fd++, set >>= 1) { struct file *file; if (!(set & 1)) continue; file = fdt->fd[fd]; if (!file) continue; rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } static struct file *__get_file_rcu(struct file __rcu **f) { struct file __rcu *file; struct file __rcu *file_reloaded; struct file __rcu *file_reloaded_cmp; file = rcu_dereference_raw(*f); if (!file) return NULL; if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) return ERR_PTR(-EAGAIN); file_reloaded = rcu_dereference_raw(*f); /* * Ensure that all accesses have a dependency on the load from * rcu_dereference_raw() above so we get correct ordering * between reuse/allocation and the pointer check below. */ file_reloaded_cmp = file_reloaded; OPTIMIZER_HIDE_VAR(file_reloaded_cmp); /* * atomic_long_inc_not_zero() above provided a full memory * barrier when we acquired a reference. * * This is paired with the write barrier from assigning to the * __rcu protected file pointer so that if that pointer still * matches the current file, we know we have successfully * acquired a reference to the right file. * * If the pointers don't match the file has been reallocated by * SLAB_TYPESAFE_BY_RCU. */ if (file == file_reloaded_cmp) return file_reloaded; fput(file); return ERR_PTR(-EAGAIN); } /** * get_file_rcu - try go get a reference to a file under rcu * @f: the file to get a reference on * * This function tries to get a reference on @f carefully verifying that * @f hasn't been reused. * * This function should rarely have to be used and only by users who * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. * * Return: Returns @f with the reference count increased or NULL. */ struct file *get_file_rcu(struct file __rcu **f) { for (;;) { struct file __rcu *file; file = __get_file_rcu(f); if (!IS_ERR(file)) return file; } } EXPORT_SYMBOL_GPL(get_file_rcu); /** * get_file_active - try go get a reference to a file * @f: the file to get a reference on * * In contast to get_file_rcu() the pointer itself isn't part of the * reference counting. * * This function should rarely have to be used and only by users who * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. * * Return: Returns @f with the reference count increased or NULL. */ struct file *get_file_active(struct file **f) { struct file __rcu *file; rcu_read_lock(); file = __get_file_rcu(f); rcu_read_unlock(); if (IS_ERR(file)) file = NULL; return file; } EXPORT_SYMBOL_GPL(get_file_active); static inline struct file *__fget_files_rcu(struct files_struct *files, unsigned int fd, fmode_t mask) { for (;;) { struct file *file; struct fdtable *fdt = rcu_dereference_raw(files->fdt); struct file __rcu **fdentry; unsigned long nospec_mask; /* Mask is a 0 for invalid fd's, ~0 for valid ones */ nospec_mask = array_index_mask_nospec(fd, fdt->max_fds); /* * fdentry points to the 'fd' offset, or fdt->fd[0]. * Loading from fdt->fd[0] is always safe, because the * array always exists. */ fdentry = fdt->fd + (fd & nospec_mask); /* Do the load, then mask any invalid result */ file = rcu_dereference_raw(*fdentry); file = (void *)(nospec_mask & (unsigned long)file); if (unlikely(!file)) return NULL; /* * Ok, we have a file pointer that was valid at * some point, but it might have become stale since. * * We need to confirm it by incrementing the refcount * and then check the lookup again. * * atomic_long_inc_not_zero() gives us a full memory * barrier. We only really need an 'acquire' one to * protect the loads below, but we don't have that. */ if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) continue; /* * Such a race can take two forms: * * (a) the file ref already went down to zero and the * file hasn't been reused yet or the file count * isn't zero but the file has already been reused. * * (b) the file table entry has changed under us. * Note that we don't need to re-check the 'fdt->fd' * pointer having changed, because it always goes * hand-in-hand with 'fdt'. * * If so, we need to put our ref and try again. */ if (unlikely(file != rcu_dereference_raw(*fdentry)) || unlikely(rcu_dereference_raw(files->fdt) != fdt)) { fput(file); continue; } /* * This isn't the file we're looking for or we're not * allowed to get a reference to it. */ if (unlikely(file->f_mode & mask)) { fput(file); return NULL; } /* * Ok, we have a ref to the file, and checked that it * still exists. */ return file; } } static struct file *__fget_files(struct files_struct *files, unsigned int fd, fmode_t mask) { struct file *file; rcu_read_lock(); file = __fget_files_rcu(files, fd, mask); rcu_read_unlock(); return file; } static inline struct file *__fget(unsigned int fd, fmode_t mask) { return __fget_files(current->files, fd, mask); } struct file *fget(unsigned int fd) { return __fget(fd, FMODE_PATH); } EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { return __fget(fd, 0); } EXPORT_SYMBOL(fget_raw); struct file *fget_task(struct task_struct *task, unsigned int fd) { struct file *file = NULL; task_lock(task); if (task->files) file = __fget_files(task->files, fd, 0); task_unlock(task); return file; } struct file *lookup_fdget_rcu(unsigned int fd) { return __fget_files_rcu(current->files, fd, 0); } EXPORT_SYMBOL_GPL(lookup_fdget_rcu); struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; struct file *file = NULL; task_lock(task); files = task->files; if (files) file = __fget_files_rcu(files, fd, 0); task_unlock(task); return file; } struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; unsigned int fd = *ret_fd; struct file *file = NULL; task_lock(task); files = task->files; if (files) { for (; fd < files_fdtable(files)->max_fds; fd++) { file = __fget_files_rcu(files, fd, 0); if (file) break; } } task_unlock(task); *ret_fd = fd; return file; } EXPORT_SYMBOL(task_lookup_next_fdget_rcu); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * * You can use this instead of fget if you satisfy all of the following * conditions: * 1) You must call fput_light before exiting the syscall and returning control * to userspace (i.e. you cannot remember the returned struct file * after * returning to userspace). * 2) You must not call filp_close on the returned struct file * in between * calls to fget_light and fput_light. * 3) You must not clone the current task in between the calls to fget_light * and fput_light. * * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. */ static inline struct fd __fget_light(unsigned int fd, fmode_t mask) { struct files_struct *files = current->files; struct file *file; /* * If another thread is concurrently calling close_fd() followed * by put_files_struct(), we must not observe the old table * entry combined with the new refcount - otherwise we could * return a file that is concurrently being freed. * * atomic_read_acquire() pairs with atomic_dec_and_test() in * put_files_struct(). */ if (likely(atomic_read_acquire(&files->count) == 1)) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return EMPTY_FD; return BORROWED_FD(file); } else { file = __fget_files(files, fd, mask); if (!file) return EMPTY_FD; return CLONED_FD(file); } } struct fd fdget(unsigned int fd) { return __fget_light(fd, FMODE_PATH); } EXPORT_SYMBOL(fdget); struct fd fdget_raw(unsigned int fd) { return __fget_light(fd, 0); } /* * Try to avoid f_pos locking. We only need it if the * file is marked for FMODE_ATOMIC_POS, and it can be * accessed multiple ways. * * Always do it for directories, because pidfd_getfd() * can make a file accessible even if it otherwise would * not be, and for directories this is a correctness * issue, not a "POSIX requirement". */ static inline bool file_needs_f_pos_lock(struct file *file) { return (file->f_mode & FMODE_ATOMIC_POS) && (file_count(file) > 1 || file->f_op->iterate_shared); } struct fd fdget_pos(unsigned int fd) { struct fd f = fdget(fd); struct file *file = fd_file(f); if (file && file_needs_f_pos_lock(file)) { f.word |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); } return f; } void __f_unlock_pos(struct file *f) { mutex_unlock(&f->f_pos_lock); } /* * We only lock f_pos if we have threads or if the file might be * shared with another process. In both cases we'll have an elevated * file count (done either by fdget() or by fork()). */ void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; struct fdtable *fdt; spin_lock(&files->file_lock); fdt = files_fdtable(files); if (flag) __set_close_on_exec(fd, fdt); else __clear_close_on_exec(fd, fdt); spin_unlock(&files->file_lock); } bool get_close_on_exec(unsigned int fd) { bool res; rcu_read_lock(); res = close_on_exec(fd, current->files); rcu_read_unlock(); return res; } static int do_dup2(struct files_struct *files, struct file *file, unsigned fd, unsigned flags) __releases(&files->file_lock) { struct file *tofree; struct fdtable *fdt; /* * We need to detect attempts to do dup2() over allocated but still * not finished descriptor. NB: OpenBSD avoids that at the price of * extra work in their equivalent of fget() - they insert struct * file immediately after grabbing descriptor, mark it larval if * more work (e.g. actual opening) is needed and make sure that * fget() treats larval files as absent. Potentially interesting, * but while extra work in fget() is trivial, locking implications * and amount of surgery on open()-related paths in VFS are not. * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" * deadlocks in rather amusing ways, AFAICS. All of that is out of * scope of POSIX or SUS, since neither considers shared descriptor * tables and this condition does not arise without those. */ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); tofree = fdt->fd[fd]; if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); rcu_assign_pointer(fdt->fd[fd], file); __set_open_fd(fd, fdt); if (flags & O_CLOEXEC) __set_close_on_exec(fd, fdt); else __clear_close_on_exec(fd, fdt); spin_unlock(&files->file_lock); if (tofree) filp_close(tofree, files); return fd; Ebusy: spin_unlock(&files->file_lock); return -EBUSY; } int replace_fd(unsigned fd, struct file *file, unsigned flags) { int err; struct files_struct *files = current->files; if (!file) return close_fd(fd); if (fd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; return do_dup2(files, file, fd, flags); out_unlock: spin_unlock(&files->file_lock); return err; } /** * receive_fd() - Install received file into file descriptor table * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry * * Installs a received file into the file descriptor table, with appropriate * checks and count updates. Optionally writes the fd number to userspace, if * @ufd is non-NULL. * * This helper handles its own reference counting of the incoming * struct file. * * Returns newly install fd or -ve on error. */ int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int new_fd; int error; error = security_file_receive(file); if (error) return error; new_fd = get_unused_fd_flags(o_flags); if (new_fd < 0) return new_fd; if (ufd) { error = put_user(new_fd, ufd); if (error) { put_unused_fd(new_fd); return error; } } fd_install(new_fd, get_file(file)); __receive_sock(file); return new_fd; } EXPORT_SYMBOL_GPL(receive_fd); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) { int error; error = security_file_receive(file); if (error) return error; error = replace_fd(new_fd, file, o_flags); if (error) return error; __receive_sock(file); return new_fd; } static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; struct file *file; struct files_struct *files = current->files; if ((flags & ~O_CLOEXEC) != 0) return -EINVAL; if (unlikely(oldfd == newfd)) return -EINVAL; if (newfd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, newfd); file = files_lookup_fd_locked(files, oldfd); if (unlikely(!file)) goto Ebadf; if (unlikely(err < 0)) { if (err == -EMFILE) goto Ebadf; goto out_unlock; } return do_dup2(files, file, newfd, flags); Ebadf: err = -EBADF; out_unlock: spin_unlock(&files->file_lock); return err; } SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) { return ksys_dup3(oldfd, newfd, flags); } SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; struct file *f; int retval = oldfd; rcu_read_lock(); f = __fget_files_rcu(files, oldfd, 0); if (!f) retval = -EBADF; rcu_read_unlock(); if (f) fput(f); return retval; } return ksys_dup3(oldfd, newfd, 0); } SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); if (file) { ret = get_unused_fd_flags(0); if (ret >= 0) fd_install(ret, file); else fput(file); } return ret; } int f_dupfd(unsigned int from, struct file *file, unsigned flags) { unsigned long nofile = rlimit(RLIMIT_NOFILE); int err; if (from >= nofile) return -EINVAL; err = alloc_fd(from, nofile, flags); if (err >= 0) { get_file(file); fd_install(err, file); } return err; } int iterate_fd(struct files_struct *files, unsigned n, int (*f)(const void *, struct file *, unsigned), const void *p) { struct fdtable *fdt; int res = 0; if (!files) return 0; spin_lock(&files->file_lock); for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { struct file *file; file = rcu_dereference_check_fdtable(files, fdt->fd[n]); if (!file) continue; res = f(p, file, n); if (res) break; } spin_unlock(&files->file_lock); return res; } EXPORT_SYMBOL(iterate_fd); |
11 11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor task related definitions and mediation * * Copyright 2017 Canonical Ltd. */ #ifndef __AA_TASK_H #define __AA_TASK_H static inline struct aa_task_ctx *task_ctx(struct task_struct *task) { return task->security + apparmor_blob_sizes.lbs_task; } /* * struct aa_task_ctx - information for current task label change * @nnp: snapshot of label at time of no_new_privs * @onexec: profile to transition to on next exec (MAY BE NULL) * @previous: profile the task may return to (MAY BE NULL) * @token: magic value the task must know for returning to @previous_profile */ struct aa_task_ctx { struct aa_label *nnp; struct aa_label *onexec; struct aa_label *previous; u64 token; }; int aa_replace_current_label(struct aa_label *label); void aa_set_current_onexec(struct aa_label *label, bool stack); int aa_set_current_hat(struct aa_label *label, u64 token); int aa_restore_previous_label(u64 cookie); struct aa_label *aa_get_task_label(struct task_struct *task); /** * aa_free_task_ctx - free a task_ctx * @ctx: task_ctx to free (MAYBE NULL) */ static inline void aa_free_task_ctx(struct aa_task_ctx *ctx) { if (ctx) { aa_put_label(ctx->nnp); aa_put_label(ctx->previous); aa_put_label(ctx->onexec); } } /** * aa_dup_task_ctx - duplicate a task context, incrementing reference counts * @new: a blank task context (NOT NULL) * @old: the task context to copy (NOT NULL) */ static inline void aa_dup_task_ctx(struct aa_task_ctx *new, const struct aa_task_ctx *old) { *new = *old; aa_get_label(new->nnp); aa_get_label(new->previous); aa_get_label(new->onexec); } /** * aa_clear_task_ctx_trans - clear transition tracking info from the ctx * @ctx: task context to clear (NOT NULL) */ static inline void aa_clear_task_ctx_trans(struct aa_task_ctx *ctx) { AA_BUG(!ctx); aa_put_label(ctx->previous); aa_put_label(ctx->onexec); ctx->previous = NULL; ctx->onexec = NULL; ctx->token = 0; } #define AA_PTRACE_TRACE MAY_WRITE #define AA_PTRACE_READ MAY_READ #define AA_MAY_BE_TRACED AA_MAY_APPEND #define AA_MAY_BE_READ AA_MAY_CREATE #define PTRACE_PERM_SHIFT 2 #define AA_PTRACE_PERM_MASK (AA_PTRACE_READ | AA_PTRACE_TRACE | \ AA_MAY_BE_READ | AA_MAY_BE_TRACED) #define AA_SIGNAL_PERM_MASK (MAY_READ | MAY_WRITE) #define AA_SFS_SIG_MASK "hup int quit ill trap abrt bus fpe kill usr1 " \ "segv usr2 pipe alrm term stkflt chld cont stop stp ttin ttou urg " \ "xcpu xfsz vtalrm prof winch io pwr sys emt lost" int aa_may_ptrace(const struct cred *tracer_cred, struct aa_label *tracer, const struct cred *tracee_cred, struct aa_label *tracee, u32 request); #define AA_USERNS_CREATE 8 int aa_profile_ns_perm(struct aa_profile *profile, struct apparmor_audit_data *ad, u32 request); #endif /* __AA_TASK_H */ |
2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,mark type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 Forceadd support */ /* 2 skbinfo support */ #define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Vytas Dauksa <vytas.dauksa@smoothwall.net>"); IP_SET_MODULE_DESC("hash:ip,mark", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,mark"); /* Type specific function prefix */ #define HTYPE hash_ipmark #define IP_SET_HASH_WITH_MARKMASK /* IPv4 variant */ /* Member elements */ struct hash_ipmark4_elem { __be32 ip; __u32 mark; }; /* Common functions */ static bool hash_ipmark4_data_equal(const struct hash_ipmark4_elem *ip1, const struct hash_ipmark4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->mark == ip2->mark; } static bool hash_ipmark4_data_list(struct sk_buff *skb, const struct hash_ipmark4_elem *data) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipmark4_data_next(struct hash_ipmark4_elem *next, const struct hash_ipmark4_elem *d) { next->ip = d->ip; } #define MTYPE hash_ipmark4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_ipmark4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ipmark4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmark4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); e.mark = skb->mark; e.mark &= h->markmask; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_ipmark4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmark4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip, ip_to = 0, i = 0; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_MARK))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK])); e.mark &= h->markmask; if (e.mark == 0 && e.ip == 0) return -IPSET_ERR_HASH_ELEM; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip = ntohl(e.ip); if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) { if (e.mark == 0 && ip_to == 0) return -IPSET_ERR_HASH_ELEM; swap(ip, ip_to); } } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++, i++) { e.ip = htonl(ip); if (i > IPSET_MAX_RANGE) { hash_ipmark4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } /* IPv6 variant */ struct hash_ipmark6_elem { union nf_inet_addr ip; __u32 mark; }; /* Common functions */ static bool hash_ipmark6_data_equal(const struct hash_ipmark6_elem *ip1, const struct hash_ipmark6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ip1->mark == ip2->mark; } static bool hash_ipmark6_data_list(struct sk_buff *skb, const struct hash_ipmark6_elem *data) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_net32(skb, IPSET_ATTR_MARK, htonl(data->mark))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipmark6_data_next(struct hash_ipmark6_elem *next, const struct hash_ipmark6_elem *d) { } #undef MTYPE #undef HOST_MASK #define MTYPE hash_ipmark6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ipmark6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ipmark6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmark6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); e.mark = skb->mark; e.mark &= h->markmask; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipmark6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ipmark6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipmark6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_attr_netorder(tb, IPSET_ATTR_MARK))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK])); e.mark &= h->markmask; if (adt == IPSET_TEST) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; return 0; } static struct ip_set_type hash_ipmark_type __read_mostly = { .name = "hash:ip,mark", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_MARK, .dimension = IPSET_DIM_TWO, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipmark_create, .create_policy = { [IPSET_ATTR_MARKMASK] = { .type = NLA_U32 }, [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_MARK] = { .type = NLA_U32 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ipmark_init(void) { return ip_set_type_register(&hash_ipmark_type); } static void __exit hash_ipmark_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_ipmark_type); } module_init(hash_ipmark_init); module_exit(hash_ipmark_fini); |
35 107 79 4 45 15 15 13 13 11 11 72 32 34 35 41 41 21 21 7 8 8 12 10 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 | // SPDX-License-Identifier: GPL-2.0-or-later /* * xfrm algorithm interface * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> */ #include <crypto/aead.h> #include <crypto/hash.h> #include <crypto/skcipher.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/pfkeyv2.h> #include <linux/crypto.h> #include <linux/scatterlist.h> #include <net/xfrm.h> #if IS_ENABLED(CONFIG_INET_ESP) || IS_ENABLED(CONFIG_INET6_ESP) #include <net/esp.h> #endif /* * Algorithms supported by IPsec. These entries contain properties which * are used in key negotiation and xfrm processing, and are used to verify * that instantiated crypto transforms have correct parameters for IPsec * purposes. */ static struct xfrm_algo_desc aead_list[] = { { .name = "rfc4106(gcm(aes))", .uinfo = { .aead = { .geniv = "seqiv", .icv_truncbits = 64, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_EALG_AES_GCM_ICV8, .sadb_alg_ivlen = 8, .sadb_alg_minbits = 128, .sadb_alg_maxbits = 256 } }, { .name = "rfc4106(gcm(aes))", .uinfo = { .aead = { .geniv = "seqiv", .icv_truncbits = 96, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_EALG_AES_GCM_ICV12, .sadb_alg_ivlen = 8, .sadb_alg_minbits = 128, .sadb_alg_maxbits = 256 } }, { .name = "rfc4106(gcm(aes))", .uinfo = { .aead = { .geniv = "seqiv", .icv_truncbits = 128, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_EALG_AES_GCM_ICV16, .sadb_alg_ivlen = 8, .sadb_alg_minbits = 128, .sadb_alg_maxbits = 256 } }, { .name = "rfc4309(ccm(aes))", .uinfo = { .aead = { .geniv = "seqiv", .icv_truncbits = 64, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_EALG_AES_CCM_ICV8, .sadb_alg_ivlen = 8, .sadb_alg_minbits = 128, .sadb_alg_maxbits = 256 } }, { .name = "rfc4309(ccm(aes))", .uinfo = { .aead = { .geniv = "seqiv", .icv_truncbits = 96, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_EALG_AES_CCM_ICV12, .sadb_alg_ivlen = 8, .sadb_alg_minbits = 128, .sadb_alg_maxbits = 256 } }, { .name = "rfc4309(ccm(aes))", .uinfo = { .aead = { .geniv = "seqiv", .icv_truncbits = 128, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_EALG_AES_CCM_ICV16, .sadb_alg_ivlen = 8, .sadb_alg_minbits = 128, .sadb_alg_maxbits = 256 } }, { .name = "rfc4543(gcm(aes))", .uinfo = { .aead = { .geniv = "seqiv", .icv_truncbits = 128, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_EALG_NULL_AES_GMAC, .sadb_alg_ivlen = 8, .sadb_alg_minbits = 128, .sadb_alg_maxbits = 256 } }, { .name = "rfc7539esp(chacha20,poly1305)", .uinfo = { .aead = { .geniv = "seqiv", .icv_truncbits = 128, } }, .pfkey_supported = 0, }, }; static struct xfrm_algo_desc aalg_list[] = { { .name = "digest_null", .uinfo = { .auth = { .icv_truncbits = 0, .icv_fullbits = 0, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_AALG_NULL, .sadb_alg_ivlen = 0, .sadb_alg_minbits = 0, .sadb_alg_maxbits = 0 } }, { .name = "hmac(md5)", .compat = "md5", .uinfo = { .auth = { .icv_truncbits = 96, .icv_fullbits = 128, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_AALG_MD5HMAC, .sadb_alg_ivlen = 0, .sadb_alg_minbits = 128, .sadb_alg_maxbits = 128 } }, { .name = "hmac(sha1)", .compat = "sha1", .uinfo = { .auth = { .icv_truncbits = 96, .icv_fullbits = 160, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_AALG_SHA1HMAC, .sadb_alg_ivlen = 0, .sadb_alg_minbits = 160, .sadb_alg_maxbits = 160 } }, { .name = "hmac(sha256)", .compat = "sha256", .uinfo = { .auth = { .icv_truncbits = 96, .icv_fullbits = 256, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_AALG_SHA2_256HMAC, .sadb_alg_ivlen = 0, .sadb_alg_minbits = 256, .sadb_alg_maxbits = 256 } }, { .name = "hmac(sha384)", .uinfo = { .auth = { .icv_truncbits = 192, .icv_fullbits = 384, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_AALG_SHA2_384HMAC, .sadb_alg_ivlen = 0, .sadb_alg_minbits = 384, .sadb_alg_maxbits = 384 } }, { .name = "hmac(sha512)", .uinfo = { .auth = { .icv_truncbits = 256, .icv_fullbits = 512, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_AALG_SHA2_512HMAC, .sadb_alg_ivlen = 0, .sadb_alg_minbits = 512, .sadb_alg_maxbits = 512 } }, { .name = "hmac(rmd160)", .compat = "rmd160", .uinfo = { .auth = { .icv_truncbits = 96, .icv_fullbits = 160, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_AALG_RIPEMD160HMAC, .sadb_alg_ivlen = 0, .sadb_alg_minbits = 160, .sadb_alg_maxbits = 160 } }, { .name = "xcbc(aes)", .uinfo = { .auth = { .icv_truncbits = 96, .icv_fullbits = 128, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_AALG_AES_XCBC_MAC, .sadb_alg_ivlen = 0, .sadb_alg_minbits = 128, .sadb_alg_maxbits = 128 } }, { /* rfc4494 */ .name = "cmac(aes)", .uinfo = { .auth = { .icv_truncbits = 96, .icv_fullbits = 128, } }, .pfkey_supported = 0, }, { .name = "hmac(sm3)", .compat = "sm3", .uinfo = { .auth = { .icv_truncbits = 256, .icv_fullbits = 256, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_AALG_SM3_256HMAC, .sadb_alg_ivlen = 0, .sadb_alg_minbits = 256, .sadb_alg_maxbits = 256 } }, }; static struct xfrm_algo_desc ealg_list[] = { { .name = "ecb(cipher_null)", .compat = "cipher_null", .uinfo = { .encr = { .blockbits = 8, .defkeybits = 0, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_EALG_NULL, .sadb_alg_ivlen = 0, .sadb_alg_minbits = 0, .sadb_alg_maxbits = 0 } }, { .name = "cbc(des)", .compat = "des", .uinfo = { .encr = { .geniv = "echainiv", .blockbits = 64, .defkeybits = 64, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_EALG_DESCBC, .sadb_alg_ivlen = 8, .sadb_alg_minbits = 64, .sadb_alg_maxbits = 64 } }, { .name = "cbc(des3_ede)", .compat = "des3_ede", .uinfo = { .encr = { .geniv = "echainiv", .blockbits = 64, .defkeybits = 192, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_EALG_3DESCBC, .sadb_alg_ivlen = 8, .sadb_alg_minbits = 192, .sadb_alg_maxbits = 192 } }, { .name = "cbc(cast5)", .compat = "cast5", .uinfo = { .encr = { .geniv = "echainiv", .blockbits = 64, .defkeybits = 128, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_EALG_CASTCBC, .sadb_alg_ivlen = 8, .sadb_alg_minbits = 40, .sadb_alg_maxbits = 128 } }, { .name = "cbc(blowfish)", .compat = "blowfish", .uinfo = { .encr = { .geniv = "echainiv", .blockbits = 64, .defkeybits = 128, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_EALG_BLOWFISHCBC, .sadb_alg_ivlen = 8, .sadb_alg_minbits = 40, .sadb_alg_maxbits = 448 } }, { .name = "cbc(aes)", .compat = "aes", .uinfo = { .encr = { .geniv = "echainiv", .blockbits = 128, .defkeybits = 128, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_EALG_AESCBC, .sadb_alg_ivlen = 8, .sadb_alg_minbits = 128, .sadb_alg_maxbits = 256 } }, { .name = "cbc(serpent)", .compat = "serpent", .uinfo = { .encr = { .geniv = "echainiv", .blockbits = 128, .defkeybits = 128, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_EALG_SERPENTCBC, .sadb_alg_ivlen = 8, .sadb_alg_minbits = 128, .sadb_alg_maxbits = 256, } }, { .name = "cbc(camellia)", .compat = "camellia", .uinfo = { .encr = { .geniv = "echainiv", .blockbits = 128, .defkeybits = 128, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_EALG_CAMELLIACBC, .sadb_alg_ivlen = 8, .sadb_alg_minbits = 128, .sadb_alg_maxbits = 256 } }, { .name = "cbc(twofish)", .compat = "twofish", .uinfo = { .encr = { .geniv = "echainiv", .blockbits = 128, .defkeybits = 128, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_EALG_TWOFISHCBC, .sadb_alg_ivlen = 8, .sadb_alg_minbits = 128, .sadb_alg_maxbits = 256 } }, { .name = "rfc3686(ctr(aes))", .uinfo = { .encr = { .geniv = "seqiv", .blockbits = 128, .defkeybits = 160, /* 128-bit key + 32-bit nonce */ } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_EALG_AESCTR, .sadb_alg_ivlen = 8, .sadb_alg_minbits = 160, .sadb_alg_maxbits = 288 } }, { .name = "cbc(sm4)", .compat = "sm4", .uinfo = { .encr = { .geniv = "echainiv", .blockbits = 128, .defkeybits = 128, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_EALG_SM4CBC, .sadb_alg_ivlen = 16, .sadb_alg_minbits = 128, .sadb_alg_maxbits = 256 } }, }; static struct xfrm_algo_desc calg_list[] = { { .name = "deflate", .uinfo = { .comp = { .threshold = 90, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_CALG_DEFLATE } }, { .name = "lzs", .uinfo = { .comp = { .threshold = 90, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_CALG_LZS } }, { .name = "lzjh", .uinfo = { .comp = { .threshold = 50, } }, .pfkey_supported = 1, .desc = { .sadb_alg_id = SADB_X_CALG_LZJH } }, }; static inline int aalg_entries(void) { return ARRAY_SIZE(aalg_list); } static inline int ealg_entries(void) { return ARRAY_SIZE(ealg_list); } static inline int calg_entries(void) { return ARRAY_SIZE(calg_list); } struct xfrm_algo_list { int (*find)(const char *name, u32 type, u32 mask); struct xfrm_algo_desc *algs; int entries; }; static const struct xfrm_algo_list xfrm_aead_list = { .find = crypto_has_aead, .algs = aead_list, .entries = ARRAY_SIZE(aead_list), }; static const struct xfrm_algo_list xfrm_aalg_list = { .find = crypto_has_ahash, .algs = aalg_list, .entries = ARRAY_SIZE(aalg_list), }; static const struct xfrm_algo_list xfrm_ealg_list = { .find = crypto_has_skcipher, .algs = ealg_list, .entries = ARRAY_SIZE(ealg_list), }; static const struct xfrm_algo_list xfrm_calg_list = { .find = crypto_has_comp, .algs = calg_list, .entries = ARRAY_SIZE(calg_list), }; static struct xfrm_algo_desc *xfrm_find_algo( const struct xfrm_algo_list *algo_list, int match(const struct xfrm_algo_desc *entry, const void *data), const void *data, int probe) { struct xfrm_algo_desc *list = algo_list->algs; int i, status; for (i = 0; i < algo_list->entries; i++) { if (!match(list + i, data)) continue; if (list[i].available) return &list[i]; if (!probe) break; status = algo_list->find(list[i].name, 0, 0); if (!status) break; list[i].available = status; return &list[i]; } return NULL; } static int xfrm_alg_id_match(const struct xfrm_algo_desc *entry, const void *data) { return entry->desc.sadb_alg_id == (unsigned long)data; } struct xfrm_algo_desc *xfrm_aalg_get_byid(int alg_id) { return xfrm_find_algo(&xfrm_aalg_list, xfrm_alg_id_match, (void *)(unsigned long)alg_id, 1); } EXPORT_SYMBOL_GPL(xfrm_aalg_get_byid); struct xfrm_algo_desc *xfrm_ealg_get_byid(int alg_id) { return xfrm_find_algo(&xfrm_ealg_list, xfrm_alg_id_match, (void *)(unsigned long)alg_id, 1); } EXPORT_SYMBOL_GPL(xfrm_ealg_get_byid); struct xfrm_algo_desc *xfrm_calg_get_byid(int alg_id) { return xfrm_find_algo(&xfrm_calg_list, xfrm_alg_id_match, (void *)(unsigned long)alg_id, 1); } EXPORT_SYMBOL_GPL(xfrm_calg_get_byid); static int xfrm_alg_name_match(const struct xfrm_algo_desc *entry, const void *data) { const char *name = data; return name && (!strcmp(name, entry->name) || (entry->compat && !strcmp(name, entry->compat))); } struct xfrm_algo_desc *xfrm_aalg_get_byname(const char *name, int probe) { return xfrm_find_algo(&xfrm_aalg_list, xfrm_alg_name_match, name, probe); } EXPORT_SYMBOL_GPL(xfrm_aalg_get_byname); struct xfrm_algo_desc *xfrm_ealg_get_byname(const char *name, int probe) { return xfrm_find_algo(&xfrm_ealg_list, xfrm_alg_name_match, name, probe); } EXPORT_SYMBOL_GPL(xfrm_ealg_get_byname); struct xfrm_algo_desc *xfrm_calg_get_byname(const char *name, int probe) { return xfrm_find_algo(&xfrm_calg_list, xfrm_alg_name_match, name, probe); } EXPORT_SYMBOL_GPL(xfrm_calg_get_byname); struct xfrm_aead_name { const char *name; int icvbits; }; static int xfrm_aead_name_match(const struct xfrm_algo_desc *entry, const void *data) { const struct xfrm_aead_name *aead = data; const char *name = aead->name; return aead->icvbits == entry->uinfo.aead.icv_truncbits && name && !strcmp(name, entry->name); } struct xfrm_algo_desc *xfrm_aead_get_byname(const char *name, int icv_len, int probe) { struct xfrm_aead_name data = { .name = name, .icvbits = icv_len, }; return xfrm_find_algo(&xfrm_aead_list, xfrm_aead_name_match, &data, probe); } EXPORT_SYMBOL_GPL(xfrm_aead_get_byname); struct xfrm_algo_desc *xfrm_aalg_get_byidx(unsigned int idx) { if (idx >= aalg_entries()) return NULL; return &aalg_list[idx]; } EXPORT_SYMBOL_GPL(xfrm_aalg_get_byidx); struct xfrm_algo_desc *xfrm_ealg_get_byidx(unsigned int idx) { if (idx >= ealg_entries()) return NULL; return &ealg_list[idx]; } EXPORT_SYMBOL_GPL(xfrm_ealg_get_byidx); /* * Probe for the availability of crypto algorithms, and set the available * flag for any algorithms found on the system. This is typically called by * pfkey during userspace SA add, update or register. */ void xfrm_probe_algs(void) { int i, status; BUG_ON(in_softirq()); for (i = 0; i < aalg_entries(); i++) { status = crypto_has_ahash(aalg_list[i].name, 0, 0); if (aalg_list[i].available != status) aalg_list[i].available = status; } for (i = 0; i < ealg_entries(); i++) { status = crypto_has_skcipher(ealg_list[i].name, 0, 0); if (ealg_list[i].available != status) ealg_list[i].available = status; } for (i = 0; i < calg_entries(); i++) { status = crypto_has_comp(calg_list[i].name, 0, CRYPTO_ALG_ASYNC); if (calg_list[i].available != status) calg_list[i].available = status; } } EXPORT_SYMBOL_GPL(xfrm_probe_algs); int xfrm_count_pfkey_auth_supported(void) { int i, n; for (i = 0, n = 0; i < aalg_entries(); i++) if (aalg_list[i].available && aalg_list[i].pfkey_supported) n++; return n; } EXPORT_SYMBOL_GPL(xfrm_count_pfkey_auth_supported); int xfrm_count_pfkey_enc_supported(void) { int i, n; for (i = 0, n = 0; i < ealg_entries(); i++) if (ealg_list[i].available && ealg_list[i].pfkey_supported) n++; return n; } EXPORT_SYMBOL_GPL(xfrm_count_pfkey_enc_supported); MODULE_DESCRIPTION("XFRM Algorithm interface"); MODULE_LICENSE("GPL"); |
20 21 7 1799 1803 1802 1813 2 224 224 61 1791 1803 59 218 158 158 155 1190 1204 54 68 12 13 12 9 9 11 11 1 1 1 1 2 2 7 1 6 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> #include <linux/security.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/projid.h> #include <linux/fs_struct.h> #include <linux/bsearch.h> #include <linux/sort.h> static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); static void free_user_ns(struct work_struct *work); static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) { return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); } static void dec_user_namespaces(struct ucounts *ucounts) { return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); } static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { /* Start with the same capabilities as init but useless for doing * anything as the capabilities are bound to the new user namespace. */ cred->securebits = SECUREBITS_DEFAULT; cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); cred->request_key_auth = NULL; #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ cred->user_ns = user_ns; } static unsigned long enforced_nproc_rlimit(void) { unsigned long limit = RLIM_INFINITY; /* Is RLIMIT_NPROC currently enforced? */ if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || (current_user_ns() != &init_user_ns)) limit = rlimit(RLIMIT_NPROC); return limit; } /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the * new namespace. * * This is called by copy_creds(), which will finish setting the target task's * credentials. */ int create_user_ns(struct cred *new) { struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; struct ucounts *ucounts; int ret, i; ret = -ENOSPC; if (parent_ns->level > 32) goto fail; ucounts = inc_user_namespaces(parent_ns, owner); if (!ucounts) goto fail; /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, * by verifying that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ ret = -EPERM; if (current_chrooted()) goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) goto fail_dec; ret = security_create_user_ns(new); if (ret < 0) goto fail_dec; ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); ret = ns_alloc_inum(&ns->ns); if (ret) goto fail_free; ns->ns.ops = &userns_operations; refcount_set(&ns->ns.count, 1); /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); for (i = 0; i < UCOUNT_COUNTS; i++) { ns->ucount_max[i] = INT_MAX; } set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); #ifdef CONFIG_KEYS INIT_LIST_HEAD(&ns->keyring_name_list); init_rwsem(&ns->keyring_sem); #endif ret = -ENOMEM; if (!setup_userns_sysctls(ns)) goto fail_keyring; set_cred_user_ns(new, ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_free_inum(&ns->ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: dec_user_namespaces(ucounts); fail: return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { struct cred *cred; int err = -ENOMEM; if (!(unshare_flags & CLONE_NEWUSER)) return 0; cred = prepare_creds(); if (cred) { err = create_user_ns(cred); if (err) put_cred(cred); else *new_cred = cred; } return err; } static void free_user_ns(struct work_struct *work) { struct user_namespace *parent, *ns = container_of(work, struct user_namespace, work); do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); } if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->uid_map.forward); kfree(ns->uid_map.reverse); } if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->projid_map.forward); kfree(ns->projid_map.reverse); } #if IS_ENABLED(CONFIG_BINFMT_MISC) kfree(ns->binfmt_misc); #endif retire_userns_sysctls(ns); key_free_user_ns(ns); ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); dec_user_namespaces(ucounts); ns = parent; } while (refcount_dec_and_test(&parent->ns.count)); } void __put_user_ns(struct user_namespace *ns) { schedule_work(&ns->work); } EXPORT_SYMBOL(__put_user_ns); /* * struct idmap_key - holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ u32 count; /* == 0 unless used with map_id_range_down() */ }; /* * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ static int cmp_map_id(const void *k, const void *e) { u32 first, last, id2; const struct idmap_key *key = k; const struct uid_gid_extent *el = e; id2 = key->id + key->count - 1; /* handle map_id_{down,up}() */ if (key->map_up) first = el->lower_first; else first = el->first; last = first + el->count - 1; if (key->id >= first && key->id <= last && (id2 >= first && id2 <= last)) return 0; if (key->id < first || id2 < first) return -1; return 1; } /* * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = false; key.count = count; key.id = id; return bsearch(&key, map->forward, extents, sizeof(struct uid_gid_extent), cmp_map_id); } /* * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_down_base(extents, map, id, count); else extent = map_id_range_down_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->first) + extent->lower_first; else id = (u32) -1; return id; } u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } /* * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id) { unsigned idx; u32 first, last; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last) return &map->extent[idx]; } return NULL; } /* * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id) { struct idmap_key key; key.map_up = true; key.count = 1; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } u32 map_id_up(struct uid_gid_map *map, u32 id) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_up_base(extents, map, id); else extent = map_id_up_max(extents, map, id); /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; else id = (u32) -1; return id; } /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in * @uid: User identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) { /* Map the uid to a global kernel uid */ return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); } EXPORT_SYMBOL(make_kuid); /** * from_kuid - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * If @kuid has no mapping in @targ (uid_t)-1 is returned. */ uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->uid_map, __kuid_val(kuid)); } EXPORT_SYMBOL(from_kuid); /** * from_kuid_munged - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kuid from_kuid_munged never fails and always * returns a valid uid. This makes from_kuid_munged appropriate * for use in syscalls like stat and getuid where failing the * system call and failing to provide a valid uid are not an * options. * * If @kuid has no mapping in @targ overflowuid is returned. */ uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) { uid_t uid; uid = from_kuid(targ, kuid); if (uid == (uid_t) -1) uid = overflowuid; return uid; } EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. * * When there is no mapping defined for the user-namespace gid * pair INVALID_GID is returned. Callers are expected to test * for and handle INVALID_GID being returned. INVALID_GID may be * tested for using gid_valid(). */ kgid_t make_kgid(struct user_namespace *ns, gid_t gid) { /* Map the gid to a global kernel gid */ return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); } EXPORT_SYMBOL(make_kgid); /** * from_kgid - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * If @kgid has no mapping in @targ (gid_t)-1 is returned. */ gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) { /* Map the gid from a global kernel gid */ return map_id_up(&targ->gid_map, __kgid_val(kgid)); } EXPORT_SYMBOL(from_kgid); /** * from_kgid_munged - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kgid from_kgid_munged never fails and always * returns a valid gid. This makes from_kgid_munged appropriate * for use in syscalls like stat and getgid where failing the * system call and failing to provide a valid gid are not options. * * If @kgid has no mapping in @targ overflowgid is returned. */ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) { gid_t gid; gid = from_kgid(targ, kgid); if (gid == (gid_t) -1) gid = overflowgid; return gid; } EXPORT_SYMBOL(from_kgid_munged); /** * make_kprojid - Map a user-namespace projid pair into a kprojid. * @ns: User namespace that the projid is in * @projid: Project identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) { /* Map the uid to a global kernel uid */ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); } EXPORT_SYMBOL(make_kprojid); /** * from_kprojid - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal project identifier to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * If @kprojid has no mapping in @targ (projid_t)-1 is returned. */ projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); } EXPORT_SYMBOL(from_kprojid); /** * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal projid to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kprojid from_kprojid_munged never fails and always * returns a valid projid. This makes from_kprojid_munged * appropriate for use in syscalls like stat and where * failing the system call and failing to provide a valid projid are * not an options. * * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. */ projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) { projid_t projid; projid = from_kprojid(targ, kprojid); if (projid == (projid_t) -1) projid = OVERFLOW_PROJID; return projid; } EXPORT_SYMBOL(from_kprojid_munged); static int uid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; uid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int gid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; gid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int projid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; projid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { loff_t pos = *ppos; unsigned extents = map->nr_extents; smp_rmb(); if (pos >= extents) return NULL; if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return &map->extent[pos]; return &map->forward[pos]; } static void *uid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->uid_map); } static void *gid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->gid_map); } static void *projid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->projid_map); } static void *m_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return seq->op->start(seq, pos); } static void m_stop(struct seq_file *seq, void *v) { return; } const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, .show = projid_m_show, }; static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; upper_first = extent->first; lower_first = extent->lower_first; upper_last = upper_first + extent->count - 1; lower_last = lower_first + extent->count - 1; for (idx = 0; idx < new_map->nr_extents; idx++) { u32 prev_upper_first, prev_lower_first; u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) prev = &new_map->extent[idx]; else prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; prev_upper_last = prev_upper_first + prev->count - 1; prev_lower_last = prev_lower_first + prev->count - 1; /* Does the upper range intersect a previous extent? */ if ((prev_upper_first <= upper_last) && (prev_upper_last >= upper_first)) return true; /* Does the lower range intersect a previous extent? */ if ((prev_lower_first <= lower_last) && (prev_lower_last >= lower_first)) return true; } return false; } /* * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. */ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) { struct uid_gid_extent *dest; if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { struct uid_gid_extent *forward; /* Allocate memory for 340 mappings. */ forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!forward) return -ENOMEM; /* Copy over memory. Only set up memory for the forward pointer. * Defer the memory setup for the reverse pointer. */ memcpy(forward, map->extent, map->nr_extents * sizeof(map->extent[0])); map->forward = forward; map->reverse = NULL; } if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) dest = &map->extent[map->nr_extents]; else dest = &map->forward[map->nr_extents]; *dest = *extent; map->nr_extents++; return 0; } /* cmp function to sort() forward mappings */ static int cmp_extents_forward(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->first < e2->first) return -1; if (e1->first > e2->first) return 1; return 0; } /* cmp function to sort() reverse mappings */ static int cmp_extents_reverse(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->lower_first < e2->lower_first) return -1; if (e1->lower_first > e2->lower_first) return 1; return 0; } /* * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static int sort_idmaps(struct uid_gid_map *map) { if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return 0; /* Sort forward array. */ sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_forward, NULL); /* Only copy the memory from forward we actually need. */ map->reverse = kmemdup_array(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!map->reverse) return -ENOMEM; /* Sort reverse array. */ sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_reverse, NULL); return 0; } /** * verify_root_map() - check the uid 0 mapping * @file: idmapping file * @map_ns: user namespace of the target process * @new_map: requested idmap * * If a process requests mapping parent uid 0 into the new ns, verify that the * process writing the map had the CAP_SETFCAP capability as the target process * will be able to write fscaps that are valid in ancestor user namespaces. * * Return: true if the mapping is allowed, false if not. */ static bool verify_root_map(const struct file *file, struct user_namespace *map_ns, struct uid_gid_map *new_map) { int idx; const struct user_namespace *file_ns = file->f_cred->user_ns; struct uid_gid_extent *extent0 = NULL; for (idx = 0; idx < new_map->nr_extents; idx++) { if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent0 = &new_map->extent[idx]; else extent0 = &new_map->forward[idx]; if (extent0->lower_first == 0) break; extent0 = NULL; } if (!extent0) return true; if (map_ns == file_ns) { /* The process unshared its ns and is writing to its own * /proc/self/uid_map. User already has full capabilites in * the new namespace. Verify that the parent had CAP_SETFCAP * when it unshared. * */ if (!file_ns->parent_could_setfcap) return false; } else { /* Process p1 is writing to uid_map of p2, who is in a child * user namespace to p1's. Verify that the opener of the map * file has CAP_SETFCAP against the parent of the new map * namespace */ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) return false; } return true; } static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, struct uid_gid_map *map, struct uid_gid_map *parent_map) { struct seq_file *seq = file->private_data; struct user_namespace *map_ns = seq->private; struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; char *kbuf, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * * An id map fits within 1 cache line on most architectures. * * On read nothing needs to be done unless you are on an * architecture with a crazy cache coherency model like alpha. * * There is a one time data dependency between reading the * count of the extents and the values of the extents. The * desired behavior is to see the values of the extents that * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ mutex_lock(&userns_state_mutex); memset(&new_map, 0, sizeof(struct uid_gid_map)); ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) goto out; /* * Adjusting namespace settings requires capabilities on the target. */ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) goto out; /* Parse the user data */ ret = -EINVAL; pos = kbuf; for (; pos; pos = next_line) { /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } pos = skip_spaces(pos); extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; /* Verify we have been given valid starting values */ if ((extent.first == (u32) -1) || (extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ if ((extent.first + extent.count) <= extent.first) goto out; if ((extent.lower_first + extent.count) <= extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ if (mappings_overlap(&new_map, &extent)) goto out; if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; ret = insert_extent(&new_map, &extent); if (ret < 0) goto out; ret = -EINVAL; } /* Be very certain the new map actually exists */ if (new_map.nr_extents == 0) goto out; ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) goto out; ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. */ for (idx = 0; idx < new_map.nr_extents; idx++) { struct uid_gid_extent *e; u32 lower_first; if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) e = &new_map.extent[idx]; else e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, e->lower_first, e->count); /* Fail if we can not map the specified extent to * the kernel global id space. */ if (lower_first == (u32) -1) goto out; e->lower_first = lower_first; } /* * If we want to use binary search for lookup, this clones the extent * array and sorts both copies. */ ret = sort_idmaps(&new_map); if (ret < 0) goto out; /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, new_map.nr_extents * sizeof(new_map.extent[0])); } else { map->forward = new_map.forward; map->reverse = new_map.reverse; } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(new_map.forward); kfree(new_map.reverse); map->forward = NULL; map->reverse = NULL; map->nr_extents = 0; } mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; } ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; /* Anyone can set any valid project id no capability needed */ return map_write(file, buf, size, ppos, -1, &ns->projid_map, &ns->parent->projid_map); } static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { const struct cred *cred = file->f_cred; if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) return false; /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && gid_eq(gid, cred->egid)) return true; } } /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. * And the opener of the id file also has the appropriate capability. */ if (ns_capable(ns->parent, cap_setid) && file_ns_capable(file, ns->parent, cap_setid)) return true; return false; } int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? "allow" : "deny"); return 0; } ssize_t proc_setgroups_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; char kbuf[8], *pos; bool setgroups_allowed; ssize_t ret; /* Only allow a very narrow range of strings to be written */ ret = -EINVAL; if ((*ppos != 0) || (count >= sizeof(kbuf))) goto out; /* What was written? */ ret = -EFAULT; if (copy_from_user(kbuf, buf, count)) goto out; kbuf[count] = '\0'; pos = kbuf; /* What is being requested? */ ret = -EINVAL; if (strncmp(pos, "allow", 5) == 0) { pos += 5; setgroups_allowed = true; } else if (strncmp(pos, "deny", 4) == 0) { pos += 4; setgroups_allowed = false; } else goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; ret = -EPERM; mutex_lock(&userns_state_mutex); if (setgroups_allowed) { /* Enabling setgroups after setgroups has been disabled * is not allowed. */ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) goto out_unlock; } else { /* Permanently disabling setgroups after setgroups has * been enabled by writing the gid_map is not allowed. */ if (ns->gid_map.nr_extents != 0) goto out_unlock; ns->flags &= ~USERNS_SETGROUPS_ALLOWED; } mutex_unlock(&userns_state_mutex); /* Report a successful write */ *ppos = count; ret = count; out: return ret; out_unlock: mutex_unlock(&userns_state_mutex); goto out; } bool userns_may_setgroups(const struct user_namespace *ns) { bool allowed; mutex_lock(&userns_state_mutex); /* It is not safe to use setgroups until a gid mapping in * the user namespace has been established. */ allowed = ns->gid_map.nr_extents != 0; /* Is setgroups allowed? */ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); mutex_unlock(&userns_state_mutex); return allowed; } /* * Returns true if @child is the same namespace or a descendant of * @ancestor. */ bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { const struct user_namespace *ns; for (ns = child; ns->level > ancestor->level; ns = ns->parent) ; return (ns == ancestor); } bool current_in_userns(const struct user_namespace *target_ns) { return in_userns(target_ns, current_user_ns()); } EXPORT_SYMBOL(current_in_userns); static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; rcu_read_lock(); user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); return user_ns ? &user_ns->ns : NULL; } static void userns_put(struct ns_common *ns) { put_user_ns(to_user_ns(ns)); } static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering * the same user namespace. */ if (user_ns == current_user_ns()) return -EINVAL; /* Tasks that share a thread group must share a user namespace */ if (!thread_group_empty(current)) return -EINVAL; if (current->fs->users != 1) return -EINVAL; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; cred = nsset_cred(nsset); if (!cred) return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); if (set_cred_ucounts(cred) < 0) return -EINVAL; return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) { struct user_namespace *my_user_ns = current_user_ns(); struct user_namespace *owner, *p; /* See if the owner is in the current user namespace */ owner = p = ns->ops->owner(ns); for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == my_user_ns) break; p = p->parent; } return &get_user_ns(owner)->ns; } static struct user_namespace *userns_owner(struct ns_common *ns) { return to_user_ns(ns)->parent; } const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, .owner = userns_owner, .get_parent = ns_get_owner, }; static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); return 0; } subsys_initcall(user_namespaces_init); |
12 4 7 8 2 3 2 20 20 20 1 1 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 | // SPDX-License-Identifier: GPL-2.0-only /* * * Generic part shared by ipv4 and ipv6 backends. */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <linux/in.h> #include <net/xfrm.h> static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = { [NFTA_XFRM_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_XFRM_DIR] = { .type = NLA_U8 }, [NFTA_XFRM_SPNUM] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_XFRM_DREG] = { .type = NLA_U32 }, }; struct nft_xfrm { enum nft_xfrm_keys key:8; u8 dreg; u8 dir; u8 spnum; u8 len; }; static int nft_xfrm_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_xfrm *priv = nft_expr_priv(expr); unsigned int len = 0; u32 spnum = 0; u8 dir; if (!tb[NFTA_XFRM_KEY] || !tb[NFTA_XFRM_DIR] || !tb[NFTA_XFRM_DREG]) return -EINVAL; switch (ctx->family) { case NFPROTO_IPV4: case NFPROTO_IPV6: case NFPROTO_INET: break; default: return -EOPNOTSUPP; } priv->key = ntohl(nla_get_be32(tb[NFTA_XFRM_KEY])); switch (priv->key) { case NFT_XFRM_KEY_REQID: case NFT_XFRM_KEY_SPI: len = sizeof(u32); break; case NFT_XFRM_KEY_DADDR_IP4: case NFT_XFRM_KEY_SADDR_IP4: len = sizeof(struct in_addr); break; case NFT_XFRM_KEY_DADDR_IP6: case NFT_XFRM_KEY_SADDR_IP6: len = sizeof(struct in6_addr); break; default: return -EINVAL; } dir = nla_get_u8(tb[NFTA_XFRM_DIR]); switch (dir) { case XFRM_POLICY_IN: case XFRM_POLICY_OUT: priv->dir = dir; break; default: return -EINVAL; } if (tb[NFTA_XFRM_SPNUM]) spnum = ntohl(nla_get_be32(tb[NFTA_XFRM_SPNUM])); if (spnum >= XFRM_MAX_DEPTH) return -ERANGE; priv->spnum = spnum; priv->len = len; return nft_parse_register_store(ctx, tb[NFTA_XFRM_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); } /* Return true if key asks for daddr/saddr and current * state does have a valid address (BEET, TUNNEL). */ static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode) { switch (k) { case NFT_XFRM_KEY_DADDR_IP4: case NFT_XFRM_KEY_SADDR_IP4: if (family == NFPROTO_IPV4) break; return false; case NFT_XFRM_KEY_DADDR_IP6: case NFT_XFRM_KEY_SADDR_IP6: if (family == NFPROTO_IPV6) break; return false; default: return true; } return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL; } static void nft_xfrm_state_get_key(const struct nft_xfrm *priv, struct nft_regs *regs, const struct xfrm_state *state) { u32 *dest = ®s->data[priv->dreg]; if (!xfrm_state_addr_ok(priv->key, state->props.family, state->props.mode)) { regs->verdict.code = NFT_BREAK; return; } switch (priv->key) { case NFT_XFRM_KEY_UNSPEC: case __NFT_XFRM_KEY_MAX: WARN_ON_ONCE(1); break; case NFT_XFRM_KEY_DADDR_IP4: *dest = (__force __u32)state->id.daddr.a4; return; case NFT_XFRM_KEY_DADDR_IP6: memcpy(dest, &state->id.daddr.in6, sizeof(struct in6_addr)); return; case NFT_XFRM_KEY_SADDR_IP4: *dest = (__force __u32)state->props.saddr.a4; return; case NFT_XFRM_KEY_SADDR_IP6: memcpy(dest, &state->props.saddr.in6, sizeof(struct in6_addr)); return; case NFT_XFRM_KEY_REQID: *dest = state->props.reqid; return; case NFT_XFRM_KEY_SPI: *dest = (__force __u32)state->id.spi; return; } regs->verdict.code = NFT_BREAK; } static void nft_xfrm_get_eval_in(const struct nft_xfrm *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct sec_path *sp = skb_sec_path(pkt->skb); const struct xfrm_state *state; if (sp == NULL || sp->len <= priv->spnum) { regs->verdict.code = NFT_BREAK; return; } state = sp->xvec[priv->spnum]; nft_xfrm_state_get_key(priv, regs, state); } static void nft_xfrm_get_eval_out(const struct nft_xfrm *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct dst_entry *dst = skb_dst(pkt->skb); int i; for (i = 0; dst && dst->xfrm; dst = ((const struct xfrm_dst *)dst)->child, i++) { if (i < priv->spnum) continue; nft_xfrm_state_get_key(priv, regs, dst->xfrm); return; } regs->verdict.code = NFT_BREAK; } static void nft_xfrm_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_xfrm *priv = nft_expr_priv(expr); switch (priv->dir) { case XFRM_POLICY_IN: nft_xfrm_get_eval_in(priv, regs, pkt); break; case XFRM_POLICY_OUT: nft_xfrm_get_eval_out(priv, regs, pkt); break; default: WARN_ON_ONCE(1); regs->verdict.code = NFT_BREAK; break; } } static int nft_xfrm_get_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_xfrm *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_XFRM_DREG, priv->dreg)) return -1; if (nla_put_be32(skb, NFTA_XFRM_KEY, htonl(priv->key))) return -1; if (nla_put_u8(skb, NFTA_XFRM_DIR, priv->dir)) return -1; if (nla_put_be32(skb, NFTA_XFRM_SPNUM, htonl(priv->spnum))) return -1; return 0; } static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_xfrm *priv = nft_expr_priv(expr); unsigned int hooks; if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; switch (priv->dir) { case XFRM_POLICY_IN: hooks = (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_PRE_ROUTING); break; case XFRM_POLICY_OUT: hooks = (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING); break; default: WARN_ON_ONCE(1); return -EINVAL; } return nft_chain_validate_hooks(ctx->chain, hooks); } static bool nft_xfrm_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_xfrm *priv = nft_expr_priv(expr); const struct nft_xfrm *xfrm; if (!nft_reg_track_cmp(track, expr, priv->dreg)) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } xfrm = nft_expr_priv(track->regs[priv->dreg].selector); if (priv->key != xfrm->key || priv->dreg != xfrm->dreg || priv->dir != xfrm->dir || priv->spnum != xfrm->spnum) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } if (!track->regs[priv->dreg].bitwise) return true; return nft_expr_reduce_bitwise(track, expr); } static struct nft_expr_type nft_xfrm_type; static const struct nft_expr_ops nft_xfrm_get_ops = { .type = &nft_xfrm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_xfrm)), .eval = nft_xfrm_get_eval, .init = nft_xfrm_get_init, .dump = nft_xfrm_get_dump, .validate = nft_xfrm_validate, .reduce = nft_xfrm_reduce, }; static struct nft_expr_type nft_xfrm_type __read_mostly = { .name = "xfrm", .ops = &nft_xfrm_get_ops, .policy = nft_xfrm_policy, .maxattr = NFTA_XFRM_MAX, .owner = THIS_MODULE, }; static int __init nft_xfrm_module_init(void) { return nft_register_expr(&nft_xfrm_type); } static void __exit nft_xfrm_module_exit(void) { nft_unregister_expr(&nft_xfrm_type); } module_init(nft_xfrm_module_init); module_exit(nft_xfrm_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("nf_tables: xfrm/IPSec matching"); MODULE_AUTHOR("Florian Westphal <fw@strlen.de>"); MODULE_AUTHOR("Máté Eckl <ecklm94@gmail.com>"); MODULE_ALIAS_NFT_EXPR("xfrm"); |
4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SCSI_SCSI_CMND_H #define _SCSI_SCSI_CMND_H #include <linux/dma-mapping.h> #include <linux/blkdev.h> #include <linux/t10-pi.h> #include <linux/list.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/scatterlist.h> #include <scsi/scsi_device.h> struct Scsi_Host; /* * MAX_COMMAND_SIZE is: * The longest fixed-length SCSI CDB as per the SCSI standard. * fixed-length means: commands that their size can be determined * by their opcode and the CDB does not carry a length specifier, (unlike * the VARIABLE_LENGTH_CMD(0x7f) command). This is actually not exactly * true and the SCSI standard also defines extended commands and * vendor specific commands that can be bigger than 16 bytes. The kernel * will support these using the same infrastructure used for VARLEN CDB's. * So in effect MAX_COMMAND_SIZE means the maximum size command scsi-ml * supports without specifying a cmd_len by ULD's */ #define MAX_COMMAND_SIZE 16 struct scsi_data_buffer { struct sg_table table; unsigned length; }; /* embedded in scsi_cmnd */ struct scsi_pointer { char *ptr; /* data pointer */ int this_residual; /* left in this buffer */ struct scatterlist *buffer; /* which buffer */ int buffers_residual; /* how many buffers left */ dma_addr_t dma_handle; volatile int Status; volatile int Message; volatile int have_data_in; volatile int sent_command; volatile int phase; }; /* for scmd->flags */ #define SCMD_TAGGED (1 << 0) #define SCMD_INITIALIZED (1 << 1) #define SCMD_LAST (1 << 2) /* * libata uses SCSI EH to fetch sense data for successful commands. * SCSI EH should not overwrite scmd->result when SCMD_FORCE_EH_SUCCESS is set. */ #define SCMD_FORCE_EH_SUCCESS (1 << 3) #define SCMD_FAIL_IF_RECOVERING (1 << 4) /* flags preserved across unprep / reprep */ #define SCMD_PRESERVED_FLAGS (SCMD_INITIALIZED | SCMD_FAIL_IF_RECOVERING) /* for scmd->state */ #define SCMD_STATE_COMPLETE 0 #define SCMD_STATE_INFLIGHT 1 enum scsi_cmnd_submitter { SUBMITTED_BY_BLOCK_LAYER = 0, SUBMITTED_BY_SCSI_ERROR_HANDLER = 1, SUBMITTED_BY_SCSI_RESET_IOCTL = 2, } __packed; struct scsi_cmnd { struct scsi_device *device; struct list_head eh_entry; /* entry for the host eh_abort_list/eh_cmd_q */ struct delayed_work abort_work; struct rcu_head rcu; int eh_eflags; /* Used by error handlr */ int budget_token; /* * This is set to jiffies as it was when the command was first * allocated. It is used to time how long the command has * been outstanding */ unsigned long jiffies_at_alloc; int retries; int allowed; unsigned char prot_op; unsigned char prot_type; unsigned char prot_flags; enum scsi_cmnd_submitter submitter; unsigned short cmd_len; enum dma_data_direction sc_data_direction; unsigned char cmnd[32]; /* SCSI CDB */ /* These elements define the operation we ultimately want to perform */ struct scsi_data_buffer sdb; struct scsi_data_buffer *prot_sdb; unsigned underflow; /* Return error if less than this amount is transferred */ unsigned transfersize; /* How much we are guaranteed to transfer with each SCSI transfer (ie, between disconnect / reconnects. Probably == sector size */ unsigned resid_len; /* residual count */ unsigned sense_len; unsigned char *sense_buffer; /* obtained by REQUEST SENSE when * CHECK CONDITION is received on original * command (auto-sense). Length must be * SCSI_SENSE_BUFFERSIZE bytes. */ int flags; /* Command flags */ unsigned long state; /* Command completion state */ unsigned int extra_len; /* length of alignment and padding */ /* * The fields below can be modified by the LLD but the fields above * must not be modified. */ unsigned char *host_scribble; /* The host adapter is allowed to * call scsi_malloc and get some memory * and hang it here. The host adapter * is also expected to call scsi_free * to release this memory. (The memory * obtained by scsi_malloc is guaranteed * to be at an address < 16Mb). */ int result; /* Status code from lower level driver */ }; /* Variant of blk_mq_rq_from_pdu() that verifies the type of its argument. */ static inline struct request *scsi_cmd_to_rq(struct scsi_cmnd *scmd) { return blk_mq_rq_from_pdu(scmd); } /* * Return the driver private allocation behind the command. * Only works if cmd_size is set in the host template. */ static inline void *scsi_cmd_priv(struct scsi_cmnd *cmd) { return cmd + 1; } void scsi_done(struct scsi_cmnd *cmd); void scsi_done_direct(struct scsi_cmnd *cmd); extern void scsi_finish_command(struct scsi_cmnd *cmd); extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count, size_t *offset, size_t *len); extern void scsi_kunmap_atomic_sg(void *virt); blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd); void scsi_free_sgtables(struct scsi_cmnd *cmd); #ifdef CONFIG_SCSI_DMA extern int scsi_dma_map(struct scsi_cmnd *cmd); extern void scsi_dma_unmap(struct scsi_cmnd *cmd); #else /* !CONFIG_SCSI_DMA */ static inline int scsi_dma_map(struct scsi_cmnd *cmd) { return -ENOSYS; } static inline void scsi_dma_unmap(struct scsi_cmnd *cmd) { } #endif /* !CONFIG_SCSI_DMA */ static inline unsigned scsi_sg_count(struct scsi_cmnd *cmd) { return cmd->sdb.table.nents; } static inline struct scatterlist *scsi_sglist(struct scsi_cmnd *cmd) { return cmd->sdb.table.sgl; } static inline unsigned scsi_bufflen(struct scsi_cmnd *cmd) { return cmd->sdb.length; } static inline void scsi_set_resid(struct scsi_cmnd *cmd, unsigned int resid) { cmd->resid_len = resid; } static inline unsigned int scsi_get_resid(struct scsi_cmnd *cmd) { return cmd->resid_len; } #define scsi_for_each_sg(cmd, sg, nseg, __i) \ for_each_sg(scsi_sglist(cmd), sg, nseg, __i) static inline int scsi_sg_copy_from_buffer(struct scsi_cmnd *cmd, const void *buf, int buflen) { return sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), buf, buflen); } static inline int scsi_sg_copy_to_buffer(struct scsi_cmnd *cmd, void *buf, int buflen) { return sg_copy_to_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), buf, buflen); } static inline sector_t scsi_get_sector(struct scsi_cmnd *scmd) { return blk_rq_pos(scsi_cmd_to_rq(scmd)); } static inline sector_t scsi_get_lba(struct scsi_cmnd *scmd) { unsigned int shift = ilog2(scmd->device->sector_size) - SECTOR_SHIFT; return blk_rq_pos(scsi_cmd_to_rq(scmd)) >> shift; } static inline unsigned int scsi_logical_block_count(struct scsi_cmnd *scmd) { unsigned int shift = ilog2(scmd->device->sector_size); return blk_rq_bytes(scsi_cmd_to_rq(scmd)) >> shift; } /* * The operations below are hints that tell the controller driver how * to handle I/Os with DIF or similar types of protection information. */ enum scsi_prot_operations { /* Normal I/O */ SCSI_PROT_NORMAL = 0, /* OS-HBA: Protected, HBA-Target: Unprotected */ SCSI_PROT_READ_INSERT, SCSI_PROT_WRITE_STRIP, /* OS-HBA: Unprotected, HBA-Target: Protected */ SCSI_PROT_READ_STRIP, SCSI_PROT_WRITE_INSERT, /* OS-HBA: Protected, HBA-Target: Protected */ SCSI_PROT_READ_PASS, SCSI_PROT_WRITE_PASS, }; static inline void scsi_set_prot_op(struct scsi_cmnd *scmd, unsigned char op) { scmd->prot_op = op; } static inline unsigned char scsi_get_prot_op(struct scsi_cmnd *scmd) { return scmd->prot_op; } enum scsi_prot_flags { SCSI_PROT_TRANSFER_PI = 1 << 0, SCSI_PROT_GUARD_CHECK = 1 << 1, SCSI_PROT_REF_CHECK = 1 << 2, SCSI_PROT_REF_INCREMENT = 1 << 3, SCSI_PROT_IP_CHECKSUM = 1 << 4, }; /* * The controller usually does not know anything about the target it * is communicating with. However, when DIX is enabled the controller * must be know target type so it can verify the protection * information passed along with the I/O. */ enum scsi_prot_target_type { SCSI_PROT_DIF_TYPE0 = 0, SCSI_PROT_DIF_TYPE1, SCSI_PROT_DIF_TYPE2, SCSI_PROT_DIF_TYPE3, }; static inline void scsi_set_prot_type(struct scsi_cmnd *scmd, unsigned char type) { scmd->prot_type = type; } static inline unsigned char scsi_get_prot_type(struct scsi_cmnd *scmd) { return scmd->prot_type; } static inline u32 scsi_prot_ref_tag(struct scsi_cmnd *scmd) { struct request *rq = blk_mq_rq_from_pdu(scmd); return t10_pi_ref_tag(rq); } static inline unsigned int scsi_prot_interval(struct scsi_cmnd *scmd) { return scmd->device->sector_size; } static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd) { return cmd->prot_sdb ? cmd->prot_sdb->table.nents : 0; } static inline struct scatterlist *scsi_prot_sglist(struct scsi_cmnd *cmd) { return cmd->prot_sdb ? cmd->prot_sdb->table.sgl : NULL; } static inline struct scsi_data_buffer *scsi_prot(struct scsi_cmnd *cmd) { return cmd->prot_sdb; } #define scsi_for_each_prot_sg(cmd, sg, nseg, __i) \ for_each_sg(scsi_prot_sglist(cmd), sg, nseg, __i) static inline void set_status_byte(struct scsi_cmnd *cmd, char status) { cmd->result = (cmd->result & 0xffffff00) | status; } static inline u8 get_status_byte(struct scsi_cmnd *cmd) { return cmd->result & 0xff; } static inline void set_host_byte(struct scsi_cmnd *cmd, char status) { cmd->result = (cmd->result & 0xff00ffff) | (status << 16); } static inline u8 get_host_byte(struct scsi_cmnd *cmd) { return (cmd->result >> 16) & 0xff; } /** * scsi_msg_to_host_byte() - translate message byte * @cmd: the SCSI command * @msg: the SCSI parallel message byte to translate * * Translate the SCSI parallel message byte to a matching * host byte setting. A message of COMMAND_COMPLETE indicates * a successful command execution, any other message indicate * an error. As the messages themselves only have a meaning * for the SCSI parallel protocol this function translates * them into a matching host byte value for SCSI EH. */ static inline void scsi_msg_to_host_byte(struct scsi_cmnd *cmd, u8 msg) { switch (msg) { case COMMAND_COMPLETE: break; case ABORT_TASK_SET: set_host_byte(cmd, DID_ABORT); break; case TARGET_RESET: set_host_byte(cmd, DID_RESET); break; default: set_host_byte(cmd, DID_ERROR); break; } } static inline unsigned scsi_transfer_length(struct scsi_cmnd *scmd) { unsigned int xfer_len = scmd->sdb.length; unsigned int prot_interval = scsi_prot_interval(scmd); if (scmd->prot_flags & SCSI_PROT_TRANSFER_PI) xfer_len += (xfer_len >> ilog2(prot_interval)) * 8; return xfer_len; } extern void scsi_build_sense(struct scsi_cmnd *scmd, int desc, u8 key, u8 asc, u8 ascq); struct request *scsi_alloc_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags); #endif /* _SCSI_SCSI_CMND_H */ |
2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 | // SPDX-License-Identifier: GPL-2.0 /* * SM4 Cipher Algorithm. * * Copyright (C) 2018 ARM Limited or its affiliates. * All rights reserved. */ #include <crypto/algapi.h> #include <crypto/sm4.h> #include <linux/module.h> #include <linux/init.h> #include <linux/types.h> #include <linux/errno.h> #include <asm/byteorder.h> #include <linux/unaligned.h> /** * sm4_setkey - Set the SM4 key. * @tfm: The %crypto_tfm that is used in the context. * @in_key: The input key. * @key_len: The size of the key. * * This function uses sm4_expandkey() to expand the key. * &sm4_ctx _must_ be the private data embedded in @tfm which is * retrieved with crypto_tfm_ctx(). * * Return: 0 on success; -EINVAL on failure (only happens for bad key lengths) */ static int sm4_setkey(struct crypto_tfm *tfm, const u8 *in_key, unsigned int key_len) { struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); return sm4_expandkey(ctx, in_key, key_len); } /* encrypt a block of text */ static void sm4_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); sm4_crypt_block(ctx->rkey_enc, out, in); } /* decrypt a block of text */ static void sm4_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm); sm4_crypt_block(ctx->rkey_dec, out, in); } static struct crypto_alg sm4_alg = { .cra_name = "sm4", .cra_driver_name = "sm4-generic", .cra_priority = 100, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = SM4_BLOCK_SIZE, .cra_ctxsize = sizeof(struct sm4_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = SM4_KEY_SIZE, .cia_max_keysize = SM4_KEY_SIZE, .cia_setkey = sm4_setkey, .cia_encrypt = sm4_encrypt, .cia_decrypt = sm4_decrypt } } }; static int __init sm4_init(void) { return crypto_register_alg(&sm4_alg); } static void __exit sm4_fini(void) { crypto_unregister_alg(&sm4_alg); } subsys_initcall(sm4_init); module_exit(sm4_fini); MODULE_DESCRIPTION("SM4 Cipher Algorithm"); MODULE_LICENSE("GPL v2"); MODULE_ALIAS_CRYPTO("sm4"); MODULE_ALIAS_CRYPTO("sm4-generic"); |
2 21 21 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 | /* * DRBG based on NIST SP800-90A * * Copyright Stephan Mueller <smueller@chronox.de>, 2014 * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * ALTERNATIVELY, this product may be distributed under the terms of * the GNU General Public License, in which case the provisions of the GPL are * required INSTEAD OF the above restrictions. (This clause is * necessary due to a potential bad interaction between the GPL and * the restrictions contained in a BSD-style copyright.) * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #ifndef _DRBG_H #define _DRBG_H #include <linux/random.h> #include <linux/scatterlist.h> #include <crypto/hash.h> #include <crypto/skcipher.h> #include <linux/module.h> #include <linux/crypto.h> #include <linux/slab.h> #include <crypto/internal/rng.h> #include <crypto/rng.h> #include <linux/fips.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/workqueue.h> /* * Concatenation Helper and string operation helper * * SP800-90A requires the concatenation of different data. To avoid copying * buffers around or allocate additional memory, the following data structure * is used to point to the original memory with its size. In addition, it * is used to build a linked list. The linked list defines the concatenation * of individual buffers. The order of memory block referenced in that * linked list determines the order of concatenation. */ struct drbg_string { const unsigned char *buf; size_t len; struct list_head list; }; static inline void drbg_string_fill(struct drbg_string *string, const unsigned char *buf, size_t len) { string->buf = buf; string->len = len; INIT_LIST_HEAD(&string->list); } struct drbg_state; typedef uint32_t drbg_flag_t; struct drbg_core { drbg_flag_t flags; /* flags for the cipher */ __u8 statelen; /* maximum state length */ __u8 blocklen_bytes; /* block size of output in bytes */ char cra_name[CRYPTO_MAX_ALG_NAME]; /* mapping to kernel crypto API */ /* kernel crypto API backend cipher name */ char backend_cra_name[CRYPTO_MAX_ALG_NAME]; }; struct drbg_state_ops { int (*update)(struct drbg_state *drbg, struct list_head *seed, int reseed); int (*generate)(struct drbg_state *drbg, unsigned char *buf, unsigned int buflen, struct list_head *addtl); int (*crypto_init)(struct drbg_state *drbg); int (*crypto_fini)(struct drbg_state *drbg); }; struct drbg_test_data { struct drbg_string *testentropy; /* TEST PARAMETER: test entropy */ }; enum drbg_seed_state { DRBG_SEED_STATE_UNSEEDED, DRBG_SEED_STATE_PARTIAL, /* Seeded with !rng_is_initialized() */ DRBG_SEED_STATE_FULL, }; struct drbg_state { struct mutex drbg_mutex; /* lock around DRBG */ unsigned char *V; /* internal state 10.1.1.1 1a) */ unsigned char *Vbuf; /* hash: static value 10.1.1.1 1b) hmac / ctr: key */ unsigned char *C; unsigned char *Cbuf; /* Number of RNG requests since last reseed -- 10.1.1.1 1c) */ size_t reseed_ctr; size_t reseed_threshold; /* some memory the DRBG can use for its operation */ unsigned char *scratchpad; unsigned char *scratchpadbuf; void *priv_data; /* Cipher handle */ struct crypto_skcipher *ctr_handle; /* CTR mode cipher handle */ struct skcipher_request *ctr_req; /* CTR mode request handle */ __u8 *outscratchpadbuf; /* CTR mode output scratchpad */ __u8 *outscratchpad; /* CTR mode aligned outbuf */ struct crypto_wait ctr_wait; /* CTR mode async wait obj */ struct scatterlist sg_in, sg_out; /* CTR mode SGLs */ enum drbg_seed_state seeded; /* DRBG fully seeded? */ unsigned long last_seed_time; bool pr; /* Prediction resistance enabled? */ bool fips_primed; /* Continuous test primed? */ unsigned char *prev; /* FIPS 140-2 continuous test value */ struct crypto_rng *jent; const struct drbg_state_ops *d_ops; const struct drbg_core *core; struct drbg_string test_data; }; static inline __u8 drbg_statelen(struct drbg_state *drbg) { if (drbg && drbg->core) return drbg->core->statelen; return 0; } static inline __u8 drbg_blocklen(struct drbg_state *drbg) { if (drbg && drbg->core) return drbg->core->blocklen_bytes; return 0; } static inline __u8 drbg_keylen(struct drbg_state *drbg) { if (drbg && drbg->core) return (drbg->core->statelen - drbg->core->blocklen_bytes); return 0; } static inline size_t drbg_max_request_bytes(struct drbg_state *drbg) { /* SP800-90A requires the limit 2**19 bits, but we return bytes */ return (1 << 16); } static inline size_t drbg_max_addtl(struct drbg_state *drbg) { /* SP800-90A requires 2**35 bytes additional info str / pers str */ #if (__BITS_PER_LONG == 32) /* * SP800-90A allows smaller maximum numbers to be returned -- we * return SIZE_MAX - 1 to allow the verification of the enforcement * of this value in drbg_healthcheck_sanity. */ return (SIZE_MAX - 1); #else return (1UL<<35); #endif } static inline size_t drbg_max_requests(struct drbg_state *drbg) { /* SP800-90A requires 2**48 maximum requests before reseeding */ return (1<<20); } /* * This is a wrapper to the kernel crypto API function of * crypto_rng_generate() to allow the caller to provide additional data. * * @drng DRBG handle -- see crypto_rng_get_bytes * @outbuf output buffer -- see crypto_rng_get_bytes * @outlen length of output buffer -- see crypto_rng_get_bytes * @addtl_input additional information string input buffer * @addtllen length of additional information string buffer * * return * see crypto_rng_get_bytes */ static inline int crypto_drbg_get_bytes_addtl(struct crypto_rng *drng, unsigned char *outbuf, unsigned int outlen, struct drbg_string *addtl) { return crypto_rng_generate(drng, addtl->buf, addtl->len, outbuf, outlen); } /* * TEST code * * This is a wrapper to the kernel crypto API function of * crypto_rng_generate() to allow the caller to provide additional data and * allow furnishing of test_data * * @drng DRBG handle -- see crypto_rng_get_bytes * @outbuf output buffer -- see crypto_rng_get_bytes * @outlen length of output buffer -- see crypto_rng_get_bytes * @addtl_input additional information string input buffer * @addtllen length of additional information string buffer * @test_data filled test data * * return * see crypto_rng_get_bytes */ static inline int crypto_drbg_get_bytes_addtl_test(struct crypto_rng *drng, unsigned char *outbuf, unsigned int outlen, struct drbg_string *addtl, struct drbg_test_data *test_data) { crypto_rng_set_entropy(drng, test_data->testentropy->buf, test_data->testentropy->len); return crypto_rng_generate(drng, addtl->buf, addtl->len, outbuf, outlen); } /* * TEST code * * This is a wrapper to the kernel crypto API function of * crypto_rng_reset() to allow the caller to provide test_data * * @drng DRBG handle -- see crypto_rng_reset * @pers personalization string input buffer * @perslen length of additional information string buffer * @test_data filled test data * * return * see crypto_rng_reset */ static inline int crypto_drbg_reset_test(struct crypto_rng *drng, struct drbg_string *pers, struct drbg_test_data *test_data) { crypto_rng_set_entropy(drng, test_data->testentropy->buf, test_data->testentropy->len); return crypto_rng_reset(drng, pers->buf, pers->len); } /* DRBG type flags */ #define DRBG_CTR ((drbg_flag_t)1<<0) #define DRBG_HMAC ((drbg_flag_t)1<<1) #define DRBG_HASH ((drbg_flag_t)1<<2) #define DRBG_TYPE_MASK (DRBG_CTR | DRBG_HMAC | DRBG_HASH) /* DRBG strength flags */ #define DRBG_STRENGTH128 ((drbg_flag_t)1<<3) #define DRBG_STRENGTH192 ((drbg_flag_t)1<<4) #define DRBG_STRENGTH256 ((drbg_flag_t)1<<5) #define DRBG_STRENGTH_MASK (DRBG_STRENGTH128 | DRBG_STRENGTH192 | \ DRBG_STRENGTH256) enum drbg_prefixes { DRBG_PREFIX0 = 0x00, DRBG_PREFIX1, DRBG_PREFIX2, DRBG_PREFIX3 }; #endif /* _DRBG_H */ |
4 1 2 1 2 1 3 2 1 1 1 1 1 1 1 3 2 1 2 1 1 1 1 1 1 1 4 3 1 2 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 | // SPDX-License-Identifier: GPL-2.0+ /* * IPv6 IOAM implementation * * Author: * Justin Iurman <justin.iurman@uliege.be> */ #include <linux/errno.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/net.h> #include <linux/ioam6.h> #include <linux/ioam6_genl.h> #include <linux/rhashtable.h> #include <linux/netdevice.h> #include <net/addrconf.h> #include <net/genetlink.h> #include <net/ioam6.h> #include <net/sch_generic.h> static void ioam6_ns_release(struct ioam6_namespace *ns) { kfree_rcu(ns, rcu); } static void ioam6_sc_release(struct ioam6_schema *sc) { kfree_rcu(sc, rcu); } static void ioam6_free_ns(void *ptr, void *arg) { struct ioam6_namespace *ns = (struct ioam6_namespace *)ptr; if (ns) ioam6_ns_release(ns); } static void ioam6_free_sc(void *ptr, void *arg) { struct ioam6_schema *sc = (struct ioam6_schema *)ptr; if (sc) ioam6_sc_release(sc); } static int ioam6_ns_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { const struct ioam6_namespace *ns = obj; return (ns->id != *(__be16 *)arg->key); } static int ioam6_sc_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { const struct ioam6_schema *sc = obj; return (sc->id != *(u32 *)arg->key); } static const struct rhashtable_params rht_ns_params = { .key_len = sizeof(__be16), .key_offset = offsetof(struct ioam6_namespace, id), .head_offset = offsetof(struct ioam6_namespace, head), .automatic_shrinking = true, .obj_cmpfn = ioam6_ns_cmpfn, }; static const struct rhashtable_params rht_sc_params = { .key_len = sizeof(u32), .key_offset = offsetof(struct ioam6_schema, id), .head_offset = offsetof(struct ioam6_schema, head), .automatic_shrinking = true, .obj_cmpfn = ioam6_sc_cmpfn, }; static struct genl_family ioam6_genl_family; static const struct nla_policy ioam6_genl_policy_addns[] = { [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 }, [IOAM6_ATTR_NS_DATA] = { .type = NLA_U32 }, [IOAM6_ATTR_NS_DATA_WIDE] = { .type = NLA_U64 }, }; static const struct nla_policy ioam6_genl_policy_delns[] = { [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 }, }; static const struct nla_policy ioam6_genl_policy_addsc[] = { [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 }, [IOAM6_ATTR_SC_DATA] = { .type = NLA_BINARY, .len = IOAM6_MAX_SCHEMA_DATA_LEN }, }; static const struct nla_policy ioam6_genl_policy_delsc[] = { [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 }, }; static const struct nla_policy ioam6_genl_policy_ns_sc[] = { [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 }, [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 }, [IOAM6_ATTR_SC_NONE] = { .type = NLA_FLAG }, }; static int ioam6_genl_addns(struct sk_buff *skb, struct genl_info *info) { struct ioam6_pernet_data *nsdata; struct ioam6_namespace *ns; u64 data64; u32 data32; __be16 id; int err; if (!info->attrs[IOAM6_ATTR_NS_ID]) return -EINVAL; id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID])); nsdata = ioam6_pernet(genl_info_net(info)); mutex_lock(&nsdata->lock); ns = rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params); if (ns) { err = -EEXIST; goto out_unlock; } ns = kzalloc(sizeof(*ns), GFP_KERNEL); if (!ns) { err = -ENOMEM; goto out_unlock; } ns->id = id; if (!info->attrs[IOAM6_ATTR_NS_DATA]) data32 = IOAM6_U32_UNAVAILABLE; else data32 = nla_get_u32(info->attrs[IOAM6_ATTR_NS_DATA]); if (!info->attrs[IOAM6_ATTR_NS_DATA_WIDE]) data64 = IOAM6_U64_UNAVAILABLE; else data64 = nla_get_u64(info->attrs[IOAM6_ATTR_NS_DATA_WIDE]); ns->data = cpu_to_be32(data32); ns->data_wide = cpu_to_be64(data64); err = rhashtable_lookup_insert_fast(&nsdata->namespaces, &ns->head, rht_ns_params); if (err) kfree(ns); out_unlock: mutex_unlock(&nsdata->lock); return err; } static int ioam6_genl_delns(struct sk_buff *skb, struct genl_info *info) { struct ioam6_pernet_data *nsdata; struct ioam6_namespace *ns; struct ioam6_schema *sc; __be16 id; int err; if (!info->attrs[IOAM6_ATTR_NS_ID]) return -EINVAL; id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID])); nsdata = ioam6_pernet(genl_info_net(info)); mutex_lock(&nsdata->lock); ns = rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params); if (!ns) { err = -ENOENT; goto out_unlock; } sc = rcu_dereference_protected(ns->schema, lockdep_is_held(&nsdata->lock)); err = rhashtable_remove_fast(&nsdata->namespaces, &ns->head, rht_ns_params); if (err) goto out_unlock; if (sc) rcu_assign_pointer(sc->ns, NULL); ioam6_ns_release(ns); out_unlock: mutex_unlock(&nsdata->lock); return err; } static int __ioam6_genl_dumpns_element(struct ioam6_namespace *ns, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { struct ioam6_schema *sc; u64 data64; u32 data32; void *hdr; hdr = genlmsg_put(skb, portid, seq, &ioam6_genl_family, flags, cmd); if (!hdr) return -ENOMEM; data32 = be32_to_cpu(ns->data); data64 = be64_to_cpu(ns->data_wide); if (nla_put_u16(skb, IOAM6_ATTR_NS_ID, be16_to_cpu(ns->id)) || (data32 != IOAM6_U32_UNAVAILABLE && nla_put_u32(skb, IOAM6_ATTR_NS_DATA, data32)) || (data64 != IOAM6_U64_UNAVAILABLE && nla_put_u64_64bit(skb, IOAM6_ATTR_NS_DATA_WIDE, data64, IOAM6_ATTR_PAD))) goto nla_put_failure; rcu_read_lock(); sc = rcu_dereference(ns->schema); if (sc && nla_put_u32(skb, IOAM6_ATTR_SC_ID, sc->id)) { rcu_read_unlock(); goto nla_put_failure; } rcu_read_unlock(); genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ioam6_genl_dumpns_start(struct netlink_callback *cb) { struct ioam6_pernet_data *nsdata = ioam6_pernet(sock_net(cb->skb->sk)); struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; if (!iter) { iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return -ENOMEM; cb->args[0] = (long)iter; } rhashtable_walk_enter(&nsdata->namespaces, iter); return 0; } static int ioam6_genl_dumpns_done(struct netlink_callback *cb) { struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; rhashtable_walk_exit(iter); kfree(iter); return 0; } static int ioam6_genl_dumpns(struct sk_buff *skb, struct netlink_callback *cb) { struct rhashtable_iter *iter; struct ioam6_namespace *ns; int err; iter = (struct rhashtable_iter *)cb->args[0]; rhashtable_walk_start(iter); for (;;) { ns = rhashtable_walk_next(iter); if (IS_ERR(ns)) { if (PTR_ERR(ns) == -EAGAIN) continue; err = PTR_ERR(ns); goto done; } else if (!ns) { break; } err = __ioam6_genl_dumpns_element(ns, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, IOAM6_CMD_DUMP_NAMESPACES); if (err) goto done; } err = skb->len; done: rhashtable_walk_stop(iter); return err; } static int ioam6_genl_addsc(struct sk_buff *skb, struct genl_info *info) { struct ioam6_pernet_data *nsdata; int len, len_aligned, err; struct ioam6_schema *sc; u32 id; if (!info->attrs[IOAM6_ATTR_SC_ID] || !info->attrs[IOAM6_ATTR_SC_DATA]) return -EINVAL; id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]); nsdata = ioam6_pernet(genl_info_net(info)); mutex_lock(&nsdata->lock); sc = rhashtable_lookup_fast(&nsdata->schemas, &id, rht_sc_params); if (sc) { err = -EEXIST; goto out_unlock; } len = nla_len(info->attrs[IOAM6_ATTR_SC_DATA]); len_aligned = ALIGN(len, 4); sc = kzalloc(sizeof(*sc) + len_aligned, GFP_KERNEL); if (!sc) { err = -ENOMEM; goto out_unlock; } sc->id = id; sc->len = len_aligned; sc->hdr = cpu_to_be32(sc->id | ((u8)(sc->len / 4) << 24)); nla_memcpy(sc->data, info->attrs[IOAM6_ATTR_SC_DATA], len); err = rhashtable_lookup_insert_fast(&nsdata->schemas, &sc->head, rht_sc_params); if (err) goto free_sc; out_unlock: mutex_unlock(&nsdata->lock); return err; free_sc: kfree(sc); goto out_unlock; } static int ioam6_genl_delsc(struct sk_buff *skb, struct genl_info *info) { struct ioam6_pernet_data *nsdata; struct ioam6_namespace *ns; struct ioam6_schema *sc; int err; u32 id; if (!info->attrs[IOAM6_ATTR_SC_ID]) return -EINVAL; id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]); nsdata = ioam6_pernet(genl_info_net(info)); mutex_lock(&nsdata->lock); sc = rhashtable_lookup_fast(&nsdata->schemas, &id, rht_sc_params); if (!sc) { err = -ENOENT; goto out_unlock; } ns = rcu_dereference_protected(sc->ns, lockdep_is_held(&nsdata->lock)); err = rhashtable_remove_fast(&nsdata->schemas, &sc->head, rht_sc_params); if (err) goto out_unlock; if (ns) rcu_assign_pointer(ns->schema, NULL); ioam6_sc_release(sc); out_unlock: mutex_unlock(&nsdata->lock); return err; } static int __ioam6_genl_dumpsc_element(struct ioam6_schema *sc, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { struct ioam6_namespace *ns; void *hdr; hdr = genlmsg_put(skb, portid, seq, &ioam6_genl_family, flags, cmd); if (!hdr) return -ENOMEM; if (nla_put_u32(skb, IOAM6_ATTR_SC_ID, sc->id) || nla_put(skb, IOAM6_ATTR_SC_DATA, sc->len, sc->data)) goto nla_put_failure; rcu_read_lock(); ns = rcu_dereference(sc->ns); if (ns && nla_put_u16(skb, IOAM6_ATTR_NS_ID, be16_to_cpu(ns->id))) { rcu_read_unlock(); goto nla_put_failure; } rcu_read_unlock(); genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int ioam6_genl_dumpsc_start(struct netlink_callback *cb) { struct ioam6_pernet_data *nsdata = ioam6_pernet(sock_net(cb->skb->sk)); struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; if (!iter) { iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return -ENOMEM; cb->args[0] = (long)iter; } rhashtable_walk_enter(&nsdata->schemas, iter); return 0; } static int ioam6_genl_dumpsc_done(struct netlink_callback *cb) { struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; rhashtable_walk_exit(iter); kfree(iter); return 0; } static int ioam6_genl_dumpsc(struct sk_buff *skb, struct netlink_callback *cb) { struct rhashtable_iter *iter; struct ioam6_schema *sc; int err; iter = (struct rhashtable_iter *)cb->args[0]; rhashtable_walk_start(iter); for (;;) { sc = rhashtable_walk_next(iter); if (IS_ERR(sc)) { if (PTR_ERR(sc) == -EAGAIN) continue; err = PTR_ERR(sc); goto done; } else if (!sc) { break; } err = __ioam6_genl_dumpsc_element(sc, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, IOAM6_CMD_DUMP_SCHEMAS); if (err) goto done; } err = skb->len; done: rhashtable_walk_stop(iter); return err; } static int ioam6_genl_ns_set_schema(struct sk_buff *skb, struct genl_info *info) { struct ioam6_namespace *ns, *ns_ref; struct ioam6_schema *sc, *sc_ref; struct ioam6_pernet_data *nsdata; __be16 ns_id; u32 sc_id; int err; if (!info->attrs[IOAM6_ATTR_NS_ID] || (!info->attrs[IOAM6_ATTR_SC_ID] && !info->attrs[IOAM6_ATTR_SC_NONE])) return -EINVAL; ns_id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID])); nsdata = ioam6_pernet(genl_info_net(info)); mutex_lock(&nsdata->lock); ns = rhashtable_lookup_fast(&nsdata->namespaces, &ns_id, rht_ns_params); if (!ns) { err = -ENOENT; goto out_unlock; } if (info->attrs[IOAM6_ATTR_SC_NONE]) { sc = NULL; } else { sc_id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]); sc = rhashtable_lookup_fast(&nsdata->schemas, &sc_id, rht_sc_params); if (!sc) { err = -ENOENT; goto out_unlock; } } sc_ref = rcu_dereference_protected(ns->schema, lockdep_is_held(&nsdata->lock)); if (sc_ref) rcu_assign_pointer(sc_ref->ns, NULL); rcu_assign_pointer(ns->schema, sc); if (sc) { ns_ref = rcu_dereference_protected(sc->ns, lockdep_is_held(&nsdata->lock)); if (ns_ref) rcu_assign_pointer(ns_ref->schema, NULL); rcu_assign_pointer(sc->ns, ns); } err = 0; out_unlock: mutex_unlock(&nsdata->lock); return err; } static const struct genl_ops ioam6_genl_ops[] = { { .cmd = IOAM6_CMD_ADD_NAMESPACE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ioam6_genl_addns, .flags = GENL_ADMIN_PERM, .policy = ioam6_genl_policy_addns, .maxattr = ARRAY_SIZE(ioam6_genl_policy_addns) - 1, }, { .cmd = IOAM6_CMD_DEL_NAMESPACE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ioam6_genl_delns, .flags = GENL_ADMIN_PERM, .policy = ioam6_genl_policy_delns, .maxattr = ARRAY_SIZE(ioam6_genl_policy_delns) - 1, }, { .cmd = IOAM6_CMD_DUMP_NAMESPACES, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .start = ioam6_genl_dumpns_start, .dumpit = ioam6_genl_dumpns, .done = ioam6_genl_dumpns_done, .flags = GENL_ADMIN_PERM, }, { .cmd = IOAM6_CMD_ADD_SCHEMA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ioam6_genl_addsc, .flags = GENL_ADMIN_PERM, .policy = ioam6_genl_policy_addsc, .maxattr = ARRAY_SIZE(ioam6_genl_policy_addsc) - 1, }, { .cmd = IOAM6_CMD_DEL_SCHEMA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ioam6_genl_delsc, .flags = GENL_ADMIN_PERM, .policy = ioam6_genl_policy_delsc, .maxattr = ARRAY_SIZE(ioam6_genl_policy_delsc) - 1, }, { .cmd = IOAM6_CMD_DUMP_SCHEMAS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .start = ioam6_genl_dumpsc_start, .dumpit = ioam6_genl_dumpsc, .done = ioam6_genl_dumpsc_done, .flags = GENL_ADMIN_PERM, }, { .cmd = IOAM6_CMD_NS_SET_SCHEMA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ioam6_genl_ns_set_schema, .flags = GENL_ADMIN_PERM, .policy = ioam6_genl_policy_ns_sc, .maxattr = ARRAY_SIZE(ioam6_genl_policy_ns_sc) - 1, }, }; #define IOAM6_GENL_EV_GRP_OFFSET 0 static const struct genl_multicast_group ioam6_mcgrps[] = { [IOAM6_GENL_EV_GRP_OFFSET] = { .name = IOAM6_GENL_EV_GRP_NAME, .flags = GENL_MCAST_CAP_NET_ADMIN }, }; static int ioam6_event_put_trace(struct sk_buff *skb, struct ioam6_trace_hdr *trace, unsigned int len) { if (nla_put_u16(skb, IOAM6_EVENT_ATTR_TRACE_NAMESPACE, be16_to_cpu(trace->namespace_id)) || nla_put_u8(skb, IOAM6_EVENT_ATTR_TRACE_NODELEN, trace->nodelen) || nla_put_u32(skb, IOAM6_EVENT_ATTR_TRACE_TYPE, be32_to_cpu(trace->type_be32)) || nla_put(skb, IOAM6_EVENT_ATTR_TRACE_DATA, len - sizeof(struct ioam6_trace_hdr) - trace->remlen * 4, trace->data + trace->remlen * 4)) return 1; return 0; } void ioam6_event(enum ioam6_event_type type, struct net *net, gfp_t gfp, void *opt, unsigned int opt_len) { struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&ioam6_genl_family, net, IOAM6_GENL_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &ioam6_genl_family, 0, type); if (!nlh) goto nla_put_failure; switch (type) { case IOAM6_EVENT_UNSPEC: WARN_ON_ONCE(1); break; case IOAM6_EVENT_TRACE: if (ioam6_event_put_trace(skb, (struct ioam6_trace_hdr *)opt, opt_len)) goto nla_put_failure; break; } genlmsg_end(skb, nlh); genlmsg_multicast_netns(&ioam6_genl_family, net, skb, 0, IOAM6_GENL_EV_GRP_OFFSET, gfp); return; nla_put_failure: nlmsg_free(skb); } static struct genl_family ioam6_genl_family __ro_after_init = { .name = IOAM6_GENL_NAME, .version = IOAM6_GENL_VERSION, .netnsok = true, .parallel_ops = true, .ops = ioam6_genl_ops, .n_ops = ARRAY_SIZE(ioam6_genl_ops), .resv_start_op = IOAM6_CMD_NS_SET_SCHEMA + 1, .mcgrps = ioam6_mcgrps, .n_mcgrps = ARRAY_SIZE(ioam6_mcgrps), .module = THIS_MODULE, }; struct ioam6_namespace *ioam6_namespace(struct net *net, __be16 id) { struct ioam6_pernet_data *nsdata = ioam6_pernet(net); return rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params); } static void __ioam6_fill_trace_data(struct sk_buff *skb, struct ioam6_namespace *ns, struct ioam6_trace_hdr *trace, struct ioam6_schema *sc, u8 sclen, bool is_input) { struct timespec64 ts; ktime_t tstamp; u64 raw64; u32 raw32; u16 raw16; u8 *data; u8 byte; data = trace->data + trace->remlen * 4 - trace->nodelen * 4 - sclen * 4; /* hop_lim and node_id */ if (trace->type.bit0) { byte = ipv6_hdr(skb)->hop_limit; if (is_input) byte--; raw32 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id; *(__be32 *)data = cpu_to_be32((byte << 24) | raw32); data += sizeof(__be32); } /* ingress_if_id and egress_if_id */ if (trace->type.bit1) { if (!skb->dev) raw16 = IOAM6_U16_UNAVAILABLE; else raw16 = (__force u16)READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_id); *(__be16 *)data = cpu_to_be16(raw16); data += sizeof(__be16); if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) raw16 = IOAM6_U16_UNAVAILABLE; else raw16 = (__force u16)READ_ONCE(__in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id); *(__be16 *)data = cpu_to_be16(raw16); data += sizeof(__be16); } /* timestamp seconds */ if (trace->type.bit2) { if (!skb->dev) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); } else { tstamp = skb_tstamp_cond(skb, true); ts = ktime_to_timespec64(tstamp); *(__be32 *)data = cpu_to_be32((u32)ts.tv_sec); } data += sizeof(__be32); } /* timestamp subseconds */ if (trace->type.bit3) { if (!skb->dev) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); } else { if (!trace->type.bit2) { tstamp = skb_tstamp_cond(skb, true); ts = ktime_to_timespec64(tstamp); } *(__be32 *)data = cpu_to_be32((u32)(ts.tv_nsec / NSEC_PER_USEC)); } data += sizeof(__be32); } /* transit delay */ if (trace->type.bit4) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); data += sizeof(__be32); } /* namespace data */ if (trace->type.bit5) { *(__be32 *)data = ns->data; data += sizeof(__be32); } /* queue depth */ if (trace->type.bit6) { struct netdev_queue *queue; struct Qdisc *qdisc; __u32 qlen, backlog; if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); } else { queue = skb_get_tx_queue(skb_dst(skb)->dev, skb); qdisc = rcu_dereference(queue->qdisc); qdisc_qstats_qlen_backlog(qdisc, &qlen, &backlog); *(__be32 *)data = cpu_to_be32(backlog); } data += sizeof(__be32); } /* checksum complement */ if (trace->type.bit7) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); data += sizeof(__be32); } /* hop_lim and node_id (wide) */ if (trace->type.bit8) { byte = ipv6_hdr(skb)->hop_limit; if (is_input) byte--; raw64 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id_wide; *(__be64 *)data = cpu_to_be64(((u64)byte << 56) | raw64); data += sizeof(__be64); } /* ingress_if_id and egress_if_id (wide) */ if (trace->type.bit9) { if (!skb->dev) raw32 = IOAM6_U32_UNAVAILABLE; else raw32 = READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_id_wide); *(__be32 *)data = cpu_to_be32(raw32); data += sizeof(__be32); if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) raw32 = IOAM6_U32_UNAVAILABLE; else raw32 = READ_ONCE(__in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id_wide); *(__be32 *)data = cpu_to_be32(raw32); data += sizeof(__be32); } /* namespace data (wide) */ if (trace->type.bit10) { *(__be64 *)data = ns->data_wide; data += sizeof(__be64); } /* buffer occupancy */ if (trace->type.bit11) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); data += sizeof(__be32); } /* bit12 undefined: filled with empty value */ if (trace->type.bit12) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); data += sizeof(__be32); } /* bit13 undefined: filled with empty value */ if (trace->type.bit13) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); data += sizeof(__be32); } /* bit14 undefined: filled with empty value */ if (trace->type.bit14) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); data += sizeof(__be32); } /* bit15 undefined: filled with empty value */ if (trace->type.bit15) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); data += sizeof(__be32); } /* bit16 undefined: filled with empty value */ if (trace->type.bit16) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); data += sizeof(__be32); } /* bit17 undefined: filled with empty value */ if (trace->type.bit17) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); data += sizeof(__be32); } /* bit18 undefined: filled with empty value */ if (trace->type.bit18) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); data += sizeof(__be32); } /* bit19 undefined: filled with empty value */ if (trace->type.bit19) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); data += sizeof(__be32); } /* bit20 undefined: filled with empty value */ if (trace->type.bit20) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); data += sizeof(__be32); } /* bit21 undefined: filled with empty value */ if (trace->type.bit21) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); data += sizeof(__be32); } /* opaque state snapshot */ if (trace->type.bit22) { if (!sc) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE >> 8); } else { *(__be32 *)data = sc->hdr; data += sizeof(__be32); memcpy(data, sc->data, sc->len); } } } /* called with rcu_read_lock() */ void ioam6_fill_trace_data(struct sk_buff *skb, struct ioam6_namespace *ns, struct ioam6_trace_hdr *trace, bool is_input) { struct ioam6_schema *sc; u8 sclen = 0; /* Skip if Overflow flag is set */ if (trace->overflow) return; /* NodeLen does not include Opaque State Snapshot length. We need to * take it into account if the corresponding bit is set (bit 22) and * if the current IOAM namespace has an active schema attached to it */ sc = rcu_dereference(ns->schema); if (trace->type.bit22) { sclen = sizeof_field(struct ioam6_schema, hdr) / 4; if (sc) sclen += sc->len / 4; } /* If there is no space remaining, we set the Overflow flag and we * skip without filling the trace */ if (!trace->remlen || trace->remlen < trace->nodelen + sclen) { trace->overflow = 1; return; } __ioam6_fill_trace_data(skb, ns, trace, sc, sclen, is_input); trace->remlen -= trace->nodelen + sclen; } static int __net_init ioam6_net_init(struct net *net) { struct ioam6_pernet_data *nsdata; int err = -ENOMEM; nsdata = kzalloc(sizeof(*nsdata), GFP_KERNEL); if (!nsdata) goto out; mutex_init(&nsdata->lock); net->ipv6.ioam6_data = nsdata; err = rhashtable_init(&nsdata->namespaces, &rht_ns_params); if (err) goto free_nsdata; err = rhashtable_init(&nsdata->schemas, &rht_sc_params); if (err) goto free_rht_ns; out: return err; free_rht_ns: rhashtable_destroy(&nsdata->namespaces); free_nsdata: kfree(nsdata); net->ipv6.ioam6_data = NULL; goto out; } static void __net_exit ioam6_net_exit(struct net *net) { struct ioam6_pernet_data *nsdata = ioam6_pernet(net); rhashtable_free_and_destroy(&nsdata->namespaces, ioam6_free_ns, NULL); rhashtable_free_and_destroy(&nsdata->schemas, ioam6_free_sc, NULL); kfree(nsdata); } static struct pernet_operations ioam6_net_ops = { .init = ioam6_net_init, .exit = ioam6_net_exit, }; int __init ioam6_init(void) { int err = register_pernet_subsys(&ioam6_net_ops); if (err) goto out; err = genl_register_family(&ioam6_genl_family); if (err) goto out_unregister_pernet_subsys; #ifdef CONFIG_IPV6_IOAM6_LWTUNNEL err = ioam6_iptunnel_init(); if (err) goto out_unregister_genl; #endif pr_info("In-situ OAM (IOAM) with IPv6\n"); out: return err; #ifdef CONFIG_IPV6_IOAM6_LWTUNNEL out_unregister_genl: genl_unregister_family(&ioam6_genl_family); #endif out_unregister_pernet_subsys: unregister_pernet_subsys(&ioam6_net_ops); goto out; } void ioam6_exit(void) { #ifdef CONFIG_IPV6_IOAM6_LWTUNNEL ioam6_iptunnel_exit(); #endif genl_unregister_family(&ioam6_genl_family); unregister_pernet_subsys(&ioam6_net_ops); } |
6 1 1 1 3 1 2 2 2 1 1 1 1 2 1 2 2 2 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Checksum updating actions * * Copyright (c) 2010 Gregoire Baron <baronchon@n7mm.org> */ #include <linux/types.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/netlink.h> #include <net/netlink.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/icmp.h> #include <linux/icmpv6.h> #include <linux/igmp.h> #include <net/tcp.h> #include <net/udp.h> #include <net/ip6_checksum.h> #include <net/sctp/checksum.h> #include <net/act_api.h> #include <net/pkt_cls.h> #include <linux/tc_act/tc_csum.h> #include <net/tc_act/tc_csum.h> #include <net/tc_wrapper.h> static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, }; static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_csum_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_csum_params *params_new; struct nlattr *tb[TCA_CSUM_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_csum *parm; struct tcf_csum *p; int ret = 0, err; u32 index; if (nla == NULL) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_CSUM_MAX, nla, csum_policy, NULL); if (err < 0) return err; if (tb[TCA_CSUM_PARMS] == NULL) return -EINVAL; parm = nla_data(tb[TCA_CSUM_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_csum_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (err > 0) { if (bind) /* dont override defaults */ return ACT_P_BOUND; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } } else { return err; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; p = to_tcf_csum(*a); params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { err = -ENOMEM; goto put_chain; } params_new->update_flags = parm->update_flags; spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params_new = rcu_replace_pointer(p->params, params_new, lockdep_is_held(&p->tcf_lock)); spin_unlock_bh(&p->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (params_new) kfree_rcu(params_new, rcu); return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } /** * tcf_csum_skb_nextlayer - Get next layer pointer * @skb: sk_buff to use * @ihl: previous summed headers length * @ipl: complete packet length * @jhl: next header length * * Check the expected next layer availability in the specified sk_buff. * Return the next layer pointer if pass, NULL otherwise. */ static void *tcf_csum_skb_nextlayer(struct sk_buff *skb, unsigned int ihl, unsigned int ipl, unsigned int jhl) { int ntkoff = skb_network_offset(skb); int hl = ihl + jhl; if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) || skb_try_make_writable(skb, hl + ntkoff)) return NULL; else return (void *)(skb_network_header(skb) + ihl); } static int tcf_csum_ipv4_icmp(struct sk_buff *skb, unsigned int ihl, unsigned int ipl) { struct icmphdr *icmph; icmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmph)); if (icmph == NULL) return 0; icmph->checksum = 0; skb->csum = csum_partial(icmph, ipl - ihl, 0); icmph->checksum = csum_fold(skb->csum); skb->ip_summed = CHECKSUM_NONE; return 1; } static int tcf_csum_ipv4_igmp(struct sk_buff *skb, unsigned int ihl, unsigned int ipl) { struct igmphdr *igmph; igmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*igmph)); if (igmph == NULL) return 0; igmph->csum = 0; skb->csum = csum_partial(igmph, ipl - ihl, 0); igmph->csum = csum_fold(skb->csum); skb->ip_summed = CHECKSUM_NONE; return 1; } static int tcf_csum_ipv6_icmp(struct sk_buff *skb, unsigned int ihl, unsigned int ipl) { struct icmp6hdr *icmp6h; const struct ipv6hdr *ip6h; icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h)); if (icmp6h == NULL) return 0; ip6h = ipv6_hdr(skb); icmp6h->icmp6_cksum = 0; skb->csum = csum_partial(icmp6h, ipl - ihl, 0); icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, ipl - ihl, IPPROTO_ICMPV6, skb->csum); skb->ip_summed = CHECKSUM_NONE; return 1; } static int tcf_csum_ipv4_tcp(struct sk_buff *skb, unsigned int ihl, unsigned int ipl) { struct tcphdr *tcph; const struct iphdr *iph; if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) return 1; tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); if (tcph == NULL) return 0; iph = ip_hdr(skb); tcph->check = 0; skb->csum = csum_partial(tcph, ipl - ihl, 0); tcph->check = tcp_v4_check(ipl - ihl, iph->saddr, iph->daddr, skb->csum); skb->ip_summed = CHECKSUM_NONE; return 1; } static int tcf_csum_ipv6_tcp(struct sk_buff *skb, unsigned int ihl, unsigned int ipl) { struct tcphdr *tcph; const struct ipv6hdr *ip6h; if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) return 1; tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); if (tcph == NULL) return 0; ip6h = ipv6_hdr(skb); tcph->check = 0; skb->csum = csum_partial(tcph, ipl - ihl, 0); tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, ipl - ihl, IPPROTO_TCP, skb->csum); skb->ip_summed = CHECKSUM_NONE; return 1; } static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl, unsigned int ipl, int udplite) { struct udphdr *udph; const struct iphdr *iph; u16 ul; if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) return 1; /* * Support both UDP and UDPLITE checksum algorithms, Don't use * udph->len to get the real length without any protocol check, * UDPLITE uses udph->len for another thing, * Use iph->tot_len, or just ipl. */ udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph)); if (udph == NULL) return 0; iph = ip_hdr(skb); ul = ntohs(udph->len); if (udplite || udph->check) { udph->check = 0; if (udplite) { if (ul == 0) skb->csum = csum_partial(udph, ipl - ihl, 0); else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl)) skb->csum = csum_partial(udph, ul, 0); else goto ignore_obscure_skb; } else { if (ul != ipl - ihl) goto ignore_obscure_skb; skb->csum = csum_partial(udph, ul, 0); } udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, ul, iph->protocol, skb->csum); if (!udph->check) udph->check = CSUM_MANGLED_0; } skb->ip_summed = CHECKSUM_NONE; ignore_obscure_skb: return 1; } static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl, unsigned int ipl, int udplite) { struct udphdr *udph; const struct ipv6hdr *ip6h; u16 ul; if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) return 1; /* * Support both UDP and UDPLITE checksum algorithms, Don't use * udph->len to get the real length without any protocol check, * UDPLITE uses udph->len for another thing, * Use ip6h->payload_len + sizeof(*ip6h) ... , or just ipl. */ udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph)); if (udph == NULL) return 0; ip6h = ipv6_hdr(skb); ul = ntohs(udph->len); udph->check = 0; if (udplite) { if (ul == 0) skb->csum = csum_partial(udph, ipl - ihl, 0); else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl)) skb->csum = csum_partial(udph, ul, 0); else goto ignore_obscure_skb; } else { if (ul != ipl - ihl) goto ignore_obscure_skb; skb->csum = csum_partial(udph, ul, 0); } udph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, ul, udplite ? IPPROTO_UDPLITE : IPPROTO_UDP, skb->csum); if (!udph->check) udph->check = CSUM_MANGLED_0; skb->ip_summed = CHECKSUM_NONE; ignore_obscure_skb: return 1; } static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl, unsigned int ipl) { struct sctphdr *sctph; if (skb_is_gso(skb) && skb_is_gso_sctp(skb)) return 1; sctph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*sctph)); if (!sctph) return 0; sctph->checksum = sctp_compute_cksum(skb, skb_network_offset(skb) + ihl); skb_reset_csum_not_inet(skb); return 1; } static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) { const struct iphdr *iph; int ntkoff; ntkoff = skb_network_offset(skb); if (!pskb_may_pull(skb, sizeof(*iph) + ntkoff)) goto fail; iph = ip_hdr(skb); switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) { case IPPROTO_ICMP: if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP) if (!tcf_csum_ipv4_icmp(skb, iph->ihl * 4, ntohs(iph->tot_len))) goto fail; break; case IPPROTO_IGMP: if (update_flags & TCA_CSUM_UPDATE_FLAG_IGMP) if (!tcf_csum_ipv4_igmp(skb, iph->ihl * 4, ntohs(iph->tot_len))) goto fail; break; case IPPROTO_TCP: if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) if (!tcf_csum_ipv4_tcp(skb, iph->ihl * 4, ntohs(iph->tot_len))) goto fail; break; case IPPROTO_UDP: if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4, ntohs(iph->tot_len), 0)) goto fail; break; case IPPROTO_UDPLITE: if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) if (!tcf_csum_ipv4_udp(skb, iph->ihl * 4, ntohs(iph->tot_len), 1)) goto fail; break; case IPPROTO_SCTP: if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) && !tcf_csum_sctp(skb, iph->ihl * 4, ntohs(iph->tot_len))) goto fail; break; } if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) { if (skb_try_make_writable(skb, sizeof(*iph) + ntkoff)) goto fail; ip_send_check(ip_hdr(skb)); } return 1; fail: return 0; } static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, unsigned int ixhl, unsigned int *pl) { int off, len, optlen; unsigned char *xh = (void *)ip6xh; off = sizeof(*ip6xh); len = ixhl - off; while (len > 1) { switch (xh[off]) { case IPV6_TLV_PAD1: optlen = 1; break; case IPV6_TLV_JUMBO: optlen = xh[off + 1] + 2; if (optlen != 6 || len < 6 || (off & 3) != 2) /* wrong jumbo option length/alignment */ return 0; *pl = ntohl(*(__be32 *)(xh + off + 2)); goto done; default: optlen = xh[off + 1] + 2; if (optlen > len) /* ignore obscure options */ goto done; break; } off += optlen; len -= optlen; } done: return 1; } static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags) { struct ipv6hdr *ip6h; struct ipv6_opt_hdr *ip6xh; unsigned int hl, ixhl; unsigned int pl; int ntkoff; u8 nexthdr; ntkoff = skb_network_offset(skb); hl = sizeof(*ip6h); if (!pskb_may_pull(skb, hl + ntkoff)) goto fail; ip6h = ipv6_hdr(skb); pl = ntohs(ip6h->payload_len); nexthdr = ip6h->nexthdr; do { switch (nexthdr) { case NEXTHDR_FRAGMENT: goto ignore_skb; case NEXTHDR_ROUTING: case NEXTHDR_HOP: case NEXTHDR_DEST: if (!pskb_may_pull(skb, hl + sizeof(*ip6xh) + ntkoff)) goto fail; ip6xh = (void *)(skb_network_header(skb) + hl); ixhl = ipv6_optlen(ip6xh); if (!pskb_may_pull(skb, hl + ixhl + ntkoff)) goto fail; ip6xh = (void *)(skb_network_header(skb) + hl); if ((nexthdr == NEXTHDR_HOP) && !(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl))) goto fail; nexthdr = ip6xh->nexthdr; hl += ixhl; break; case IPPROTO_ICMPV6: if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP) if (!tcf_csum_ipv6_icmp(skb, hl, pl + sizeof(*ip6h))) goto fail; goto done; case IPPROTO_TCP: if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) if (!tcf_csum_ipv6_tcp(skb, hl, pl + sizeof(*ip6h))) goto fail; goto done; case IPPROTO_UDP: if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) if (!tcf_csum_ipv6_udp(skb, hl, pl + sizeof(*ip6h), 0)) goto fail; goto done; case IPPROTO_UDPLITE: if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) if (!tcf_csum_ipv6_udp(skb, hl, pl + sizeof(*ip6h), 1)) goto fail; goto done; case IPPROTO_SCTP: if ((update_flags & TCA_CSUM_UPDATE_FLAG_SCTP) && !tcf_csum_sctp(skb, hl, pl + sizeof(*ip6h))) goto fail; goto done; default: goto ignore_skb; } } while (pskb_may_pull(skb, hl + 1 + ntkoff)); done: ignore_skb: return 1; fail: return 0; } TC_INDIRECT_SCOPE int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_csum *p = to_tcf_csum(a); bool orig_vlan_tag_present = false; unsigned int vlan_hdr_count = 0; struct tcf_csum_params *params; u32 update_flags; __be16 protocol; int action; params = rcu_dereference_bh(p->params); tcf_lastuse_update(&p->tcf_tm); tcf_action_update_bstats(&p->common, skb); action = READ_ONCE(p->tcf_action); if (unlikely(action == TC_ACT_SHOT)) goto drop; update_flags = params->update_flags; protocol = skb_protocol(skb, false); again: switch (protocol) { case cpu_to_be16(ETH_P_IP): if (!tcf_csum_ipv4(skb, update_flags)) goto drop; break; case cpu_to_be16(ETH_P_IPV6): if (!tcf_csum_ipv6(skb, update_flags)) goto drop; break; case cpu_to_be16(ETH_P_8021AD): fallthrough; case cpu_to_be16(ETH_P_8021Q): if (skb_vlan_tag_present(skb) && !orig_vlan_tag_present) { protocol = skb->protocol; orig_vlan_tag_present = true; } else { struct vlan_hdr *vlan = (struct vlan_hdr *)skb->data; protocol = vlan->h_vlan_encapsulated_proto; skb_pull(skb, VLAN_HLEN); skb_reset_network_header(skb); vlan_hdr_count++; } goto again; } out: /* Restore the skb for the pulled VLAN tags */ while (vlan_hdr_count--) { skb_push(skb, VLAN_HLEN); skb_reset_network_header(skb); } return action; drop: tcf_action_inc_drop_qstats(&p->common); action = TC_ACT_SHOT; goto out; } static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_csum *p = to_tcf_csum(a); struct tcf_csum_params *params; struct tc_csum opt = { .index = p->tcf_index, .refcnt = refcount_read(&p->tcf_refcnt) - ref, .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, }; struct tcf_t t; spin_lock_bh(&p->tcf_lock); params = rcu_dereference_protected(p->params, lockdep_is_held(&p->tcf_lock)); opt.action = p->tcf_action; opt.update_flags = params->update_flags; if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) goto nla_put_failure; tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD)) goto nla_put_failure; spin_unlock_bh(&p->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&p->tcf_lock); nlmsg_trim(skb, b); return -1; } static void tcf_csum_cleanup(struct tc_action *a) { struct tcf_csum *p = to_tcf_csum(a); struct tcf_csum_params *params; params = rcu_dereference_protected(p->params, 1); if (params) kfree_rcu(params, rcu); } static size_t tcf_csum_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_csum)); } static int tcf_csum_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; entry->id = FLOW_ACTION_CSUM; entry->csum_flags = tcf_csum_update_flags(act); *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; fl_action->id = FLOW_ACTION_CSUM; } return 0; } static struct tc_action_ops act_csum_ops = { .kind = "csum", .id = TCA_ID_CSUM, .owner = THIS_MODULE, .act = tcf_csum_act, .dump = tcf_csum_dump, .init = tcf_csum_init, .cleanup = tcf_csum_cleanup, .get_fill_size = tcf_csum_get_fill_size, .offload_act_setup = tcf_csum_offload_act_setup, .size = sizeof(struct tcf_csum), }; MODULE_ALIAS_NET_ACT("csum"); static __net_init int csum_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_csum_ops.net_id); return tc_action_net_init(net, tn, &act_csum_ops); } static void __net_exit csum_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_csum_ops.net_id); } static struct pernet_operations csum_net_ops = { .init = csum_init_net, .exit_batch = csum_exit_net, .id = &act_csum_ops.net_id, .size = sizeof(struct tc_action_net), }; MODULE_DESCRIPTION("Checksum updating actions"); MODULE_LICENSE("GPL"); static int __init csum_init_module(void) { return tcf_register_action(&act_csum_ops, &csum_net_ops); } static void __exit csum_cleanup_module(void) { tcf_unregister_action(&act_csum_ops, &csum_net_ops); } module_init(csum_init_module); module_exit(csum_cleanup_module); |
82 83 77 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIMENS_H #define _LINUX_TIMENS_H #include <linux/sched.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/err.h> #include <linux/time64.h> struct user_namespace; extern struct user_namespace init_user_ns; struct vm_area_struct; struct timens_offsets { struct timespec64 monotonic; struct timespec64 boottime; }; struct time_namespace { struct user_namespace *user_ns; struct ucounts *ucounts; struct ns_common ns; struct timens_offsets offsets; struct page *vvar_page; /* If set prevents changing offsets after any task joined namespace. */ bool frozen_offsets; } __randomize_layout; extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS extern int vdso_join_timens(struct task_struct *task, struct time_namespace *ns); extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { refcount_inc(&ns->ns.count); return ns; } struct time_namespace *copy_time_ns(unsigned long flags, struct user_namespace *user_ns, struct time_namespace *old_ns); void free_time_ns(struct time_namespace *ns); void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk); struct page *find_timens_vvar_page(struct vm_area_struct *vma); static inline void put_time_ns(struct time_namespace *ns) { if (refcount_dec_and_test(&ns->ns.count)) free_time_ns(ns); } void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m); struct proc_timens_offset { int clockid; struct timespec64 val; }; int proc_timens_set_offset(struct file *file, struct task_struct *p, struct proc_timens_offset *offsets, int n); static inline void timens_add_monotonic(struct timespec64 *ts) { struct timens_offsets *ns_offsets = ¤t->nsproxy->time_ns->offsets; *ts = timespec64_add(*ts, ns_offsets->monotonic); } static inline void timens_add_boottime(struct timespec64 *ts) { struct timens_offsets *ns_offsets = ¤t->nsproxy->time_ns->offsets; *ts = timespec64_add(*ts, ns_offsets->boottime); } static inline u64 timens_add_boottime_ns(u64 nsec) { struct timens_offsets *ns_offsets = ¤t->nsproxy->time_ns->offsets; return nsec + timespec64_to_ns(&ns_offsets->boottime); } static inline void timens_sub_boottime(struct timespec64 *ts) { struct timens_offsets *ns_offsets = ¤t->nsproxy->time_ns->offsets; *ts = timespec64_sub(*ts, ns_offsets->boottime); } ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, struct timens_offsets *offsets); static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) { struct time_namespace *ns = current->nsproxy->time_ns; if (likely(ns == &init_time_ns)) return tim; return do_timens_ktime_to_host(clockid, tim, &ns->offsets); } #else static inline int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { return 0; } static inline void timens_commit(struct task_struct *tsk, struct time_namespace *ns) { } static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { return NULL; } static inline void put_time_ns(struct time_namespace *ns) { } static inline struct time_namespace *copy_time_ns(unsigned long flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (flags & CLONE_NEWTIME) return ERR_PTR(-EINVAL); return old_ns; } static inline void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { return; } static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) { return NULL; } static inline void timens_add_monotonic(struct timespec64 *ts) { } static inline void timens_add_boottime(struct timespec64 *ts) { } static inline u64 timens_add_boottime_ns(u64 nsec) { return nsec; } static inline void timens_sub_boottime(struct timespec64 *ts) { } static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) { return tim; } #endif struct vdso_data *arch_get_vdso_data(void *vvar_page); #endif /* _LINUX_TIMENS_H */ |
5 1 1 1 2 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 | // SPDX-License-Identifier: GPL-2.0-only /* IP tables module for matching IPsec policy * * Copyright (c) 2004,2005 Patrick McHardy, <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/init.h> #include <net/xfrm.h> #include <linux/netfilter.h> #include <linux/netfilter/xt_policy.h> #include <linux/netfilter/x_tables.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Xtables: IPsec policy match"); MODULE_LICENSE("GPL"); static inline bool xt_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *m, const union nf_inet_addr *a2, unsigned short family) { switch (family) { case NFPROTO_IPV4: return ((a1->ip ^ a2->ip) & m->ip) == 0; case NFPROTO_IPV6: return ipv6_masked_addr_cmp(&a1->in6, &m->in6, &a2->in6) == 0; } return false; } static bool match_xfrm_state(const struct xfrm_state *x, const struct xt_policy_elem *e, unsigned short family) { #define MATCH_ADDR(x,y,z) (!e->match.x || \ (xt_addr_cmp(&e->x, &e->y, (const union nf_inet_addr *)(z), family) \ ^ e->invert.x)) #define MATCH(x,y) (!e->match.x || ((e->x == (y)) ^ e->invert.x)) return MATCH_ADDR(saddr, smask, &x->props.saddr) && MATCH_ADDR(daddr, dmask, &x->id.daddr) && MATCH(proto, x->id.proto) && MATCH(mode, x->props.mode) && MATCH(spi, x->id.spi) && MATCH(reqid, x->props.reqid); } static int match_policy_in(const struct sk_buff *skb, const struct xt_policy_info *info, unsigned short family) { const struct xt_policy_elem *e; const struct sec_path *sp = skb_sec_path(skb); int strict = info->flags & XT_POLICY_MATCH_STRICT; int i, pos; if (sp == NULL) return -1; if (strict && info->len != sp->len) return 0; for (i = sp->len - 1; i >= 0; i--) { pos = strict ? i - sp->len + 1 : 0; if (pos >= info->len) return 0; e = &info->pol[pos]; if (match_xfrm_state(sp->xvec[i], e, family)) { if (!strict) return 1; } else if (strict) return 0; } return strict ? 1 : 0; } static int match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info, unsigned short family) { const struct xt_policy_elem *e; const struct dst_entry *dst = skb_dst(skb); int strict = info->flags & XT_POLICY_MATCH_STRICT; int i, pos; if (dst->xfrm == NULL) return -1; for (i = 0; dst && dst->xfrm; dst = ((struct xfrm_dst *)dst)->child, i++) { pos = strict ? i : 0; if (pos >= info->len) return 0; e = &info->pol[pos]; if (match_xfrm_state(dst->xfrm, e, family)) { if (!strict) return 1; } else if (strict) return 0; } return strict ? i == info->len : 0; } static bool policy_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_policy_info *info = par->matchinfo; int ret; if (info->flags & XT_POLICY_MATCH_IN) ret = match_policy_in(skb, info, xt_family(par)); else ret = match_policy_out(skb, info, xt_family(par)); if (ret < 0) ret = info->flags & XT_POLICY_MATCH_NONE ? true : false; else if (info->flags & XT_POLICY_MATCH_NONE) ret = false; return ret; } static int policy_mt_check(const struct xt_mtchk_param *par) { const struct xt_policy_info *info = par->matchinfo; const char *errmsg = "neither incoming nor outgoing policy selected"; if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) goto err; if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) { errmsg = "output policy not valid in PREROUTING and INPUT"; goto err; } if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT)) && info->flags & XT_POLICY_MATCH_IN) { errmsg = "input policy not valid in POSTROUTING and OUTPUT"; goto err; } if (info->len > XT_POLICY_MAX_ELEM) { errmsg = "too many policy elements"; goto err; } return 0; err: pr_info_ratelimited("%s\n", errmsg); return -EINVAL; } static struct xt_match policy_mt_reg[] __read_mostly = { { .name = "policy", .family = NFPROTO_IPV4, .checkentry = policy_mt_check, .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), .me = THIS_MODULE, }, { .name = "policy", .family = NFPROTO_IPV6, .checkentry = policy_mt_check, .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), .me = THIS_MODULE, }, }; static int __init policy_mt_init(void) { return xt_register_matches(policy_mt_reg, ARRAY_SIZE(policy_mt_reg)); } static void __exit policy_mt_exit(void) { xt_unregister_matches(policy_mt_reg, ARRAY_SIZE(policy_mt_reg)); } module_init(policy_mt_init); module_exit(policy_mt_exit); MODULE_ALIAS("ipt_policy"); MODULE_ALIAS("ip6t_policy"); |
1 5 2 10 10 2 8 8 8 10 10 10 10 10 10 1 5 1 2 1 2 1 4 10 10 1 1 1 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/cls_route.c ROUTE4 classifier. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/dst.h> #include <net/route.h> #include <net/netlink.h> #include <net/act_api.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> /* * 1. For now we assume that route tags < 256. * It allows to use direct table lookups, instead of hash tables. * 2. For now we assume that "from TAG" and "fromdev DEV" statements * are mutually exclusive. * 3. "to TAG from ANY" has higher priority, than "to ANY from XXX" */ struct route4_fastmap { struct route4_filter *filter; u32 id; int iif; }; struct route4_head { struct route4_fastmap fastmap[16]; struct route4_bucket __rcu *table[256 + 1]; struct rcu_head rcu; }; struct route4_bucket { /* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */ struct route4_filter __rcu *ht[16 + 16 + 1]; struct rcu_head rcu; }; struct route4_filter { struct route4_filter __rcu *next; u32 id; int iif; struct tcf_result res; struct tcf_exts exts; u32 handle; struct route4_bucket *bkt; struct tcf_proto *tp; struct rcu_work rwork; }; #define ROUTE4_FAILURE ((struct route4_filter *)(-1L)) static inline int route4_fastmap_hash(u32 id, int iif) { return id & 0xF; } static DEFINE_SPINLOCK(fastmap_lock); static void route4_reset_fastmap(struct route4_head *head) { spin_lock_bh(&fastmap_lock); memset(head->fastmap, 0, sizeof(head->fastmap)); spin_unlock_bh(&fastmap_lock); } static void route4_set_fastmap(struct route4_head *head, u32 id, int iif, struct route4_filter *f) { int h = route4_fastmap_hash(id, iif); /* fastmap updates must look atomic to aling id, iff, filter */ spin_lock_bh(&fastmap_lock); head->fastmap[h].id = id; head->fastmap[h].iif = iif; head->fastmap[h].filter = f; spin_unlock_bh(&fastmap_lock); } static inline int route4_hash_to(u32 id) { return id & 0xFF; } static inline int route4_hash_from(u32 id) { return (id >> 16) & 0xF; } static inline int route4_hash_iif(int iif) { return 16 + ((iif >> 16) & 0xF); } static inline int route4_hash_wild(void) { return 32; } #define ROUTE4_APPLY_RESULT() \ { \ *res = f->res; \ if (tcf_exts_has_actions(&f->exts)) { \ int r = tcf_exts_exec(skb, &f->exts, res); \ if (r < 0) { \ dont_cache = 1; \ continue; \ } \ return r; \ } else if (!dont_cache) \ route4_set_fastmap(head, id, iif, f); \ return 0; \ } TC_INDIRECT_SCOPE int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct route4_head *head = rcu_dereference_bh(tp->root); struct dst_entry *dst; struct route4_bucket *b; struct route4_filter *f; u32 id, h; int iif, dont_cache = 0; dst = skb_dst(skb); if (!dst) goto failure; id = dst->tclassid; iif = inet_iif(skb); h = route4_fastmap_hash(id, iif); spin_lock(&fastmap_lock); if (id == head->fastmap[h].id && iif == head->fastmap[h].iif && (f = head->fastmap[h].filter) != NULL) { if (f == ROUTE4_FAILURE) { spin_unlock(&fastmap_lock); goto failure; } *res = f->res; spin_unlock(&fastmap_lock); return 0; } spin_unlock(&fastmap_lock); h = route4_hash_to(id); restart: b = rcu_dereference_bh(head->table[h]); if (b) { for (f = rcu_dereference_bh(b->ht[route4_hash_from(id)]); f; f = rcu_dereference_bh(f->next)) if (f->id == id) ROUTE4_APPLY_RESULT(); for (f = rcu_dereference_bh(b->ht[route4_hash_iif(iif)]); f; f = rcu_dereference_bh(f->next)) if (f->iif == iif) ROUTE4_APPLY_RESULT(); for (f = rcu_dereference_bh(b->ht[route4_hash_wild()]); f; f = rcu_dereference_bh(f->next)) ROUTE4_APPLY_RESULT(); } if (h < 256) { h = 256; id &= ~0xFFFF; goto restart; } if (!dont_cache) route4_set_fastmap(head, id, iif, ROUTE4_FAILURE); failure: return -1; } static inline u32 to_hash(u32 id) { u32 h = id & 0xFF; if (id & 0x8000) h += 256; return h; } static inline u32 from_hash(u32 id) { id &= 0xFFFF; if (id == 0xFFFF) return 32; if (!(id & 0x8000)) { if (id > 255) return 256; return id & 0xF; } return 16 + (id & 0xF); } static void *route4_get(struct tcf_proto *tp, u32 handle) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_bucket *b; struct route4_filter *f; unsigned int h1, h2; h1 = to_hash(handle); if (h1 > 256) return NULL; h2 = from_hash(handle >> 16); if (h2 > 32) return NULL; b = rtnl_dereference(head->table[h1]); if (b) { for (f = rtnl_dereference(b->ht[h2]); f; f = rtnl_dereference(f->next)) if (f->handle == handle) return f; } return NULL; } static int route4_init(struct tcf_proto *tp) { struct route4_head *head; head = kzalloc(sizeof(struct route4_head), GFP_KERNEL); if (head == NULL) return -ENOBUFS; rcu_assign_pointer(tp->root, head); return 0; } static void __route4_delete_filter(struct route4_filter *f) { tcf_exts_destroy(&f->exts); tcf_exts_put_net(&f->exts); kfree(f); } static void route4_delete_filter_work(struct work_struct *work) { struct route4_filter *f = container_of(to_rcu_work(work), struct route4_filter, rwork); rtnl_lock(); __route4_delete_filter(f); rtnl_unlock(); } static void route4_queue_work(struct route4_filter *f) { tcf_queue_work(&f->rwork, route4_delete_filter_work); } static void route4_destroy(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { struct route4_head *head = rtnl_dereference(tp->root); int h1, h2; if (head == NULL) return; for (h1 = 0; h1 <= 256; h1++) { struct route4_bucket *b; b = rtnl_dereference(head->table[h1]); if (b) { for (h2 = 0; h2 <= 32; h2++) { struct route4_filter *f; while ((f = rtnl_dereference(b->ht[h2])) != NULL) { struct route4_filter *next; next = rtnl_dereference(f->next); RCU_INIT_POINTER(b->ht[h2], next); tcf_unbind_filter(tp, &f->res); if (tcf_exts_get_net(&f->exts)) route4_queue_work(f); else __route4_delete_filter(f); } } RCU_INIT_POINTER(head->table[h1], NULL); kfree_rcu(b, rcu); } } kfree_rcu(head, rcu); } static int route4_delete(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_filter *f = arg; struct route4_filter __rcu **fp; struct route4_filter *nf; struct route4_bucket *b; unsigned int h = 0; int i, h1; if (!head || !f) return -EINVAL; h = f->handle; b = f->bkt; fp = &b->ht[from_hash(h >> 16)]; for (nf = rtnl_dereference(*fp); nf; fp = &nf->next, nf = rtnl_dereference(*fp)) { if (nf == f) { /* unlink it */ RCU_INIT_POINTER(*fp, rtnl_dereference(f->next)); /* Remove any fastmap lookups that might ref filter * notice we unlink'd the filter so we can't get it * back in the fastmap. */ route4_reset_fastmap(head); /* Delete it */ tcf_unbind_filter(tp, &f->res); tcf_exts_get_net(&f->exts); tcf_queue_work(&f->rwork, route4_delete_filter_work); /* Strip RTNL protected tree */ for (i = 0; i <= 32; i++) { struct route4_filter *rt; rt = rtnl_dereference(b->ht[i]); if (rt) goto out; } /* OK, session has no flows */ RCU_INIT_POINTER(head->table[to_hash(h)], NULL); kfree_rcu(b, rcu); break; } } out: *last = true; for (h1 = 0; h1 <= 256; h1++) { if (rcu_access_pointer(head->table[h1])) { *last = false; break; } } return 0; } static const struct nla_policy route4_policy[TCA_ROUTE4_MAX + 1] = { [TCA_ROUTE4_CLASSID] = { .type = NLA_U32 }, [TCA_ROUTE4_TO] = NLA_POLICY_MAX(NLA_U32, 0xFF), [TCA_ROUTE4_FROM] = NLA_POLICY_MAX(NLA_U32, 0xFF), [TCA_ROUTE4_IIF] = NLA_POLICY_MAX(NLA_U32, 0x7FFF), }; static int route4_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct route4_filter *f, u32 handle, struct route4_head *head, struct nlattr **tb, struct nlattr *est, int new, u32 flags, struct netlink_ext_ack *extack) { u32 id = 0, to = 0, nhandle = 0x8000; struct route4_filter *fp; unsigned int h1; struct route4_bucket *b; int err; err = tcf_exts_validate(net, tp, tb, est, &f->exts, flags, extack); if (err < 0) return err; if (tb[TCA_ROUTE4_TO]) { if (new && handle & 0x8000) { NL_SET_ERR_MSG(extack, "Invalid handle"); return -EINVAL; } to = nla_get_u32(tb[TCA_ROUTE4_TO]); nhandle = to; } if (tb[TCA_ROUTE4_FROM] && tb[TCA_ROUTE4_IIF]) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_ROUTE4_FROM], "'from' and 'fromif' are mutually exclusive"); return -EINVAL; } if (tb[TCA_ROUTE4_FROM]) { id = nla_get_u32(tb[TCA_ROUTE4_FROM]); nhandle |= id << 16; } else if (tb[TCA_ROUTE4_IIF]) { id = nla_get_u32(tb[TCA_ROUTE4_IIF]); nhandle |= (id | 0x8000) << 16; } else nhandle |= 0xFFFF << 16; if (handle && new) { nhandle |= handle & 0x7F00; if (nhandle != handle) { NL_SET_ERR_MSG_FMT(extack, "Handle mismatch constructed: %x (expected: %x)", handle, nhandle); return -EINVAL; } } if (!nhandle) { NL_SET_ERR_MSG(extack, "Replacing with handle of 0 is invalid"); return -EINVAL; } h1 = to_hash(nhandle); b = rtnl_dereference(head->table[h1]); if (!b) { b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL); if (b == NULL) return -ENOBUFS; rcu_assign_pointer(head->table[h1], b); } else { unsigned int h2 = from_hash(nhandle >> 16); for (fp = rtnl_dereference(b->ht[h2]); fp; fp = rtnl_dereference(fp->next)) if (fp->handle == f->handle) return -EEXIST; } if (tb[TCA_ROUTE4_TO]) f->id = to; if (tb[TCA_ROUTE4_FROM]) f->id = to | id<<16; else if (tb[TCA_ROUTE4_IIF]) f->iif = id; f->handle = nhandle; f->bkt = b; f->tp = tp; if (tb[TCA_ROUTE4_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_ROUTE4_CLASSID]); tcf_bind_filter(tp, &f->res, base); } return 0; } static int route4_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, u32 flags, struct netlink_ext_ack *extack) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_filter __rcu **fp; struct route4_filter *fold, *f1, *pfp, *f = NULL; struct route4_bucket *b; struct nlattr *tb[TCA_ROUTE4_MAX + 1]; unsigned int h, th; int err; bool new = true; if (!handle) { NL_SET_ERR_MSG(extack, "Creating with handle of 0 is invalid"); return -EINVAL; } if (NL_REQ_ATTR_CHECK(extack, NULL, tca, TCA_OPTIONS)) { NL_SET_ERR_MSG_MOD(extack, "Missing options"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_ROUTE4_MAX, tca[TCA_OPTIONS], route4_policy, NULL); if (err < 0) return err; fold = *arg; if (fold && fold->handle != handle) return -EINVAL; err = -ENOBUFS; f = kzalloc(sizeof(struct route4_filter), GFP_KERNEL); if (!f) goto errout; err = tcf_exts_init(&f->exts, net, TCA_ROUTE4_ACT, TCA_ROUTE4_POLICE); if (err < 0) goto errout; if (fold) { f->id = fold->id; f->iif = fold->iif; f->handle = fold->handle; f->tp = fold->tp; f->bkt = fold->bkt; new = false; } err = route4_set_parms(net, tp, base, f, handle, head, tb, tca[TCA_RATE], new, flags, extack); if (err < 0) goto errout; h = from_hash(f->handle >> 16); fp = &f->bkt->ht[h]; for (pfp = rtnl_dereference(*fp); (f1 = rtnl_dereference(*fp)) != NULL; fp = &f1->next) if (f->handle < f1->handle) break; tcf_block_netif_keep_dst(tp->chain->block); rcu_assign_pointer(f->next, f1); rcu_assign_pointer(*fp, f); if (fold) { th = to_hash(fold->handle); h = from_hash(fold->handle >> 16); b = rtnl_dereference(head->table[th]); if (b) { fp = &b->ht[h]; for (pfp = rtnl_dereference(*fp); pfp; fp = &pfp->next, pfp = rtnl_dereference(*fp)) { if (pfp == fold) { rcu_assign_pointer(*fp, fold->next); break; } } } } route4_reset_fastmap(head); *arg = f; if (fold) { tcf_unbind_filter(tp, &fold->res); tcf_exts_get_net(&fold->exts); tcf_queue_work(&fold->rwork, route4_delete_filter_work); } return 0; errout: if (f) tcf_exts_destroy(&f->exts); kfree(f); return err; } static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held) { struct route4_head *head = rtnl_dereference(tp->root); unsigned int h, h1; if (head == NULL || arg->stop) return; for (h = 0; h <= 256; h++) { struct route4_bucket *b = rtnl_dereference(head->table[h]); if (b) { for (h1 = 0; h1 <= 32; h1++) { struct route4_filter *f; for (f = rtnl_dereference(b->ht[h1]); f; f = rtnl_dereference(f->next)) { if (!tc_cls_stats_dump(tp, arg, f)) return; } } } } } static int route4_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) { struct route4_filter *f = fh; struct nlattr *nest; u32 id; if (f == NULL) return skb->len; t->tcm_handle = f->handle; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (!(f->handle & 0x8000)) { id = f->id & 0xFF; if (nla_put_u32(skb, TCA_ROUTE4_TO, id)) goto nla_put_failure; } if (f->handle & 0x80000000) { if ((f->handle >> 16) != 0xFFFF && nla_put_u32(skb, TCA_ROUTE4_IIF, f->iif)) goto nla_put_failure; } else { id = f->id >> 16; if (nla_put_u32(skb, TCA_ROUTE4_FROM, id)) goto nla_put_failure; } if (f->res.classid && nla_put_u32(skb, TCA_ROUTE4_CLASSID, f->res.classid)) goto nla_put_failure; if (tcf_exts_dump(skb, &f->exts) < 0) goto nla_put_failure; nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static void route4_bind_class(void *fh, u32 classid, unsigned long cl, void *q, unsigned long base) { struct route4_filter *f = fh; tc_cls_bind_class(classid, cl, q, &f->res, base); } static struct tcf_proto_ops cls_route4_ops __read_mostly = { .kind = "route", .classify = route4_classify, .init = route4_init, .destroy = route4_destroy, .get = route4_get, .change = route4_change, .delete = route4_delete, .walk = route4_walk, .dump = route4_dump, .bind_class = route4_bind_class, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_CLS("route"); static int __init init_route4(void) { return register_tcf_proto_ops(&cls_route4_ops); } static void __exit exit_route4(void) { unregister_tcf_proto_ops(&cls_route4_ops); } module_init(init_route4) module_exit(exit_route4) MODULE_DESCRIPTION("Routing table realm based TC classifier"); MODULE_LICENSE("GPL"); |
11 13 24 30 30 24 19 29 11 11 11 11 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 | /* * llc_core.c - Minimum needed routines for sap handling and module init/exit * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/module.h> #include <linux/interrupt.h> #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/init.h> #include <net/net_namespace.h> #include <net/llc.h> LIST_HEAD(llc_sap_list); static DEFINE_SPINLOCK(llc_sap_list_lock); /** * llc_sap_alloc - allocates and initializes sap. * * Allocates and initializes sap. */ static struct llc_sap *llc_sap_alloc(void) { struct llc_sap *sap = kzalloc(sizeof(*sap), GFP_ATOMIC); int i; if (sap) { /* sap->laddr.mac - leave as a null, it's filled by bind */ sap->state = LLC_SAP_STATE_ACTIVE; spin_lock_init(&sap->sk_lock); for (i = 0; i < LLC_SK_LADDR_HASH_ENTRIES; i++) INIT_HLIST_NULLS_HEAD(&sap->sk_laddr_hash[i], i); refcount_set(&sap->refcnt, 1); } return sap; } static struct llc_sap *__llc_sap_find(unsigned char sap_value) { struct llc_sap *sap; list_for_each_entry(sap, &llc_sap_list, node) if (sap->laddr.lsap == sap_value) goto out; sap = NULL; out: return sap; } /** * llc_sap_find - searches a SAP in station * @sap_value: sap to be found * * Searches for a sap in the sap list of the LLC's station upon the sap ID. * If the sap is found it will be refcounted and the user will have to do * a llc_sap_put after use. * Returns the sap or %NULL if not found. */ struct llc_sap *llc_sap_find(unsigned char sap_value) { struct llc_sap *sap; rcu_read_lock_bh(); sap = __llc_sap_find(sap_value); if (!sap || !llc_sap_hold_safe(sap)) sap = NULL; rcu_read_unlock_bh(); return sap; } /** * llc_sap_open - open interface to the upper layers. * @lsap: SAP number. * @func: rcv func for datalink protos * * Interface function to upper layer. Each one who wants to get a SAP * (for example NetBEUI) should call this function. Returns the opened * SAP for success, NULL for failure. */ struct llc_sap *llc_sap_open(unsigned char lsap, int (*func)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)) { struct llc_sap *sap = NULL; spin_lock_bh(&llc_sap_list_lock); if (__llc_sap_find(lsap)) /* SAP already exists */ goto out; sap = llc_sap_alloc(); if (!sap) goto out; sap->laddr.lsap = lsap; sap->rcv_func = func; list_add_tail_rcu(&sap->node, &llc_sap_list); out: spin_unlock_bh(&llc_sap_list_lock); return sap; } /** * llc_sap_close - close interface for upper layers. * @sap: SAP to be closed. * * Close interface function to upper layer. Each one who wants to * close an open SAP (for example NetBEUI) should call this function. * Removes this sap from the list of saps in the station and then * frees the memory for this sap. */ void llc_sap_close(struct llc_sap *sap) { WARN_ON(sap->sk_count); spin_lock_bh(&llc_sap_list_lock); list_del_rcu(&sap->node); spin_unlock_bh(&llc_sap_list_lock); kfree_rcu(sap, rcu); } static struct packet_type llc_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_802_2), .func = llc_rcv, }; static int __init llc_init(void) { dev_add_pack(&llc_packet_type); return 0; } static void __exit llc_exit(void) { dev_remove_pack(&llc_packet_type); } module_init(llc_init); module_exit(llc_exit); EXPORT_SYMBOL(llc_sap_list); EXPORT_SYMBOL(llc_sap_find); EXPORT_SYMBOL(llc_sap_open); EXPORT_SYMBOL(llc_sap_close); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Procom 1997, Jay Schullist 2001, Arnaldo C. Melo 2001-2003"); MODULE_DESCRIPTION("LLC IEEE 802.2 core support"); |
11 24 26 2 4 1 1 67 31 302 28 12 691 432 1032 1039 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_NF_TABLES_H #define _NET_NF_TABLES_H #include <linux/unaligned.h> #include <linux/list.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/nf_tables.h> #include <linux/u64_stats_sync.h> #include <linux/rhashtable.h> #include <net/netfilter/nf_flow_table.h> #include <net/netlink.h> #include <net/flow_offload.h> #include <net/netns/generic.h> #define NFT_MAX_HOOKS (NF_INET_INGRESS + 1) struct module; #define NFT_JUMP_STACK_SIZE 16 enum { NFT_PKTINFO_L4PROTO = (1 << 0), NFT_PKTINFO_INNER = (1 << 1), NFT_PKTINFO_INNER_FULL = (1 << 2), }; struct nft_pktinfo { struct sk_buff *skb; const struct nf_hook_state *state; u8 flags; u8 tprot; u16 fragoff; u16 thoff; u16 inneroff; }; static inline struct sock *nft_sk(const struct nft_pktinfo *pkt) { return pkt->state->sk; } static inline unsigned int nft_thoff(const struct nft_pktinfo *pkt) { return pkt->thoff; } static inline struct net *nft_net(const struct nft_pktinfo *pkt) { return pkt->state->net; } static inline unsigned int nft_hook(const struct nft_pktinfo *pkt) { return pkt->state->hook; } static inline u8 nft_pf(const struct nft_pktinfo *pkt) { return pkt->state->pf; } static inline const struct net_device *nft_in(const struct nft_pktinfo *pkt) { return pkt->state->in; } static inline const struct net_device *nft_out(const struct nft_pktinfo *pkt) { return pkt->state->out; } static inline void nft_set_pktinfo(struct nft_pktinfo *pkt, struct sk_buff *skb, const struct nf_hook_state *state) { pkt->skb = skb; pkt->state = state; } static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt) { pkt->flags = 0; pkt->tprot = 0; pkt->thoff = 0; pkt->fragoff = 0; } /** * struct nft_verdict - nf_tables verdict * * @code: nf_tables/netfilter verdict code * @chain: destination chain for NFT_JUMP/NFT_GOTO */ struct nft_verdict { u32 code; struct nft_chain *chain; }; struct nft_data { union { u32 data[4]; struct nft_verdict verdict; }; } __attribute__((aligned(__alignof__(u64)))); #define NFT_REG32_NUM 20 /** * struct nft_regs - nf_tables register set * * @data: data registers * @verdict: verdict register * * The first four data registers alias to the verdict register. */ struct nft_regs { union { u32 data[NFT_REG32_NUM]; struct nft_verdict verdict; }; }; struct nft_regs_track { struct { const struct nft_expr *selector; const struct nft_expr *bitwise; u8 num_reg; } regs[NFT_REG32_NUM]; const struct nft_expr *cur; const struct nft_expr *last; }; /* Store/load an u8, u16 or u64 integer to/from the u32 data register. * * Note, when using concatenations, register allocation happens at 32-bit * level. So for store instruction, pad the rest part with zero to avoid * garbage values. */ static inline void nft_reg_store8(u32 *dreg, u8 val) { *dreg = 0; *(u8 *)dreg = val; } static inline u8 nft_reg_load8(const u32 *sreg) { return *(u8 *)sreg; } static inline void nft_reg_store16(u32 *dreg, u16 val) { *dreg = 0; *(u16 *)dreg = val; } static inline void nft_reg_store_be16(u32 *dreg, __be16 val) { nft_reg_store16(dreg, (__force __u16)val); } static inline u16 nft_reg_load16(const u32 *sreg) { return *(u16 *)sreg; } static inline __be16 nft_reg_load_be16(const u32 *sreg) { return (__force __be16)nft_reg_load16(sreg); } static inline __be32 nft_reg_load_be32(const u32 *sreg) { return *(__force __be32 *)sreg; } static inline void nft_reg_store64(u64 *dreg, u64 val) { put_unaligned(val, dreg); } static inline u64 nft_reg_load64(const u32 *sreg) { return get_unaligned((u64 *)sreg); } static inline void nft_data_copy(u32 *dst, const struct nft_data *src, unsigned int len) { if (len % NFT_REG32_SIZE) dst[len / NFT_REG32_SIZE] = 0; memcpy(dst, src, len); } /** * struct nft_ctx - nf_tables rule/set context * * @net: net namespace * @table: the table the chain is contained in * @chain: the chain the rule is contained in * @nla: netlink attributes * @portid: netlink portID of the original message * @seq: netlink sequence number * @flags: modifiers to new request * @family: protocol family * @level: depth of the chains * @report: notify via unicast netlink message * @reg_inited: bitmap of initialised registers */ struct nft_ctx { struct net *net; struct nft_table *table; struct nft_chain *chain; const struct nlattr * const *nla; u32 portid; u32 seq; u16 flags; u8 family; u8 level; bool report; DECLARE_BITMAP(reg_inited, NFT_REG32_NUM); }; enum nft_data_desc_flags { NFT_DATA_DESC_SETELEM = (1 << 0), }; struct nft_data_desc { enum nft_data_types type; unsigned int size; unsigned int len; unsigned int flags; }; int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_data_desc *desc, const struct nlattr *nla); void nft_data_hold(const struct nft_data *data, enum nft_data_types type); void nft_data_release(const struct nft_data *data, enum nft_data_types type); int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, enum nft_data_types type, unsigned int len); static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg) { return reg == NFT_REG_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE; } static inline enum nft_registers nft_type_to_reg(enum nft_data_types type) { return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE; } int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest); int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg); int nft_parse_register_load(const struct nft_ctx *ctx, const struct nlattr *attr, u8 *sreg, u32 len); int nft_parse_register_store(const struct nft_ctx *ctx, const struct nlattr *attr, u8 *dreg, const struct nft_data *data, enum nft_data_types type, unsigned int len); /** * struct nft_userdata - user defined data associated with an object * * @len: length of the data * @data: content * * The presence of user data is indicated in an object specific fashion, * so a length of zero can't occur and the value "len" indicates data * of length len + 1. */ struct nft_userdata { u8 len; unsigned char data[]; }; /* placeholder structure for opaque set element backend representation. */ struct nft_elem_priv { }; /** * struct nft_set_elem - generic representation of set elements * * @key: element key * @key_end: closing element key * @data: element data * @priv: element private data and extensions */ struct nft_set_elem { union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key; union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } key_end; union { u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)]; struct nft_data val; } data; struct nft_elem_priv *priv; }; static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv) { return (void *)priv; } /** * enum nft_iter_type - nftables set iterator type * * @NFT_ITER_UNSPEC: unspecified, to catch errors * @NFT_ITER_READ: read-only iteration over set elements * @NFT_ITER_UPDATE: iteration under mutex to update set element state */ enum nft_iter_type { NFT_ITER_UNSPEC, NFT_ITER_READ, NFT_ITER_UPDATE, }; struct nft_set; struct nft_set_iter { u8 genmask; enum nft_iter_type type:8; unsigned int count; unsigned int skip; int err; int (*fn)(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv); }; /** * struct nft_set_desc - description of set elements * * @ktype: key type * @klen: key length * @dtype: data type * @dlen: data length * @objtype: object type * @size: number of set elements * @policy: set policy * @gc_int: garbage collector interval * @timeout: element timeout * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @expr: set must support for expressions */ struct nft_set_desc { u32 ktype; unsigned int klen; u32 dtype; unsigned int dlen; u32 objtype; unsigned int size; u32 policy; u32 gc_int; u64 timeout; u8 field_len[NFT_REG32_COUNT]; u8 field_count; bool expr; }; /** * enum nft_set_class - performance class * * @NFT_SET_CLASS_O_1: constant, O(1) * @NFT_SET_CLASS_O_LOG_N: logarithmic, O(log N) * @NFT_SET_CLASS_O_N: linear, O(N) */ enum nft_set_class { NFT_SET_CLASS_O_1, NFT_SET_CLASS_O_LOG_N, NFT_SET_CLASS_O_N, }; /** * struct nft_set_estimate - estimation of memory and performance * characteristics * * @size: required memory * @lookup: lookup performance class * @space: memory class */ struct nft_set_estimate { u64 size; enum nft_set_class lookup; enum nft_set_class space; }; #define NFT_EXPR_MAXATTR 16 #define NFT_EXPR_SIZE(size) (sizeof(struct nft_expr) + \ ALIGN(size, __alignof__(struct nft_expr))) /** * struct nft_expr - nf_tables expression * * @ops: expression ops * @data: expression private data */ struct nft_expr { const struct nft_expr_ops *ops; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline void *nft_expr_priv(const struct nft_expr *expr) { return (void *)expr->data; } struct nft_expr_info; int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, struct nft_expr_info *info); int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr, bool reset); bool nft_expr_reduce_bitwise(struct nft_regs_track *track, const struct nft_expr *expr); struct nft_set_ext; /** * struct nft_set_ops - nf_tables set operations * * @lookup: look up an element within the set * @update: update an element if exists, add it if doesn't exist * @delete: delete an element * @insert: insert new element into set * @activate: activate new element in the next generation * @deactivate: lookup for element and deactivate it in the next generation * @flush: deactivate element in the next generation * @remove: remove element from set * @walk: iterate over all set elements * @get: get set elements * @commit: commit set elements * @abort: abort set elements * @privsize: function to return size of set private data * @estimate: estimate the required memory size and the lookup complexity class * @init: initialize private data of new set instance * @destroy: destroy private data of set instance * @gc_init: initialize garbage collection * @elemsize: element private size * * Operations lookup, update and delete have simpler interfaces, are faster * and currently only used in the packet path. All the rest are slower, * control plane functions. */ struct nft_set_ops { bool (*lookup)(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext); bool (*update)(struct nft_set *set, const u32 *key, struct nft_elem_priv * (*new)(struct nft_set *, const struct nft_expr *, struct nft_regs *), const struct nft_expr *expr, struct nft_regs *regs, const struct nft_set_ext **ext); bool (*delete)(const struct nft_set *set, const u32 *key); int (*insert)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_elem_priv **priv); void (*activate)(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv); struct nft_elem_priv * (*deactivate)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem); void (*flush)(const struct net *net, const struct nft_set *set, struct nft_elem_priv *priv); void (*remove)(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv); void (*walk)(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter); struct nft_elem_priv * (*get)(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags); void (*commit)(struct nft_set *set); void (*abort)(const struct nft_set *set); u64 (*privsize)(const struct nlattr * const nla[], const struct nft_set_desc *desc); bool (*estimate)(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est); int (*init)(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const nla[]); void (*destroy)(const struct nft_ctx *ctx, const struct nft_set *set); void (*gc_init)(const struct nft_set *set); unsigned int elemsize; }; /** * struct nft_set_type - nf_tables set type * * @ops: set ops for this type * @features: features supported by the implementation */ struct nft_set_type { const struct nft_set_ops ops; u32 features; }; #define to_set_type(o) container_of(o, struct nft_set_type, ops) struct nft_set_elem_expr { u8 size; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; #define nft_setelem_expr_at(__elem_expr, __offset) \ ((struct nft_expr *)&__elem_expr->data[__offset]) #define nft_setelem_expr_foreach(__expr, __elem_expr, __size) \ for (__expr = nft_setelem_expr_at(__elem_expr, 0), __size = 0; \ __size < (__elem_expr)->size; \ __size += (__expr)->ops->size, __expr = ((void *)(__expr)) + (__expr)->ops->size) #define NFT_SET_EXPR_MAX 2 /** * struct nft_set - nf_tables set instance * * @list: table set list node * @bindings: list of set bindings * @refs: internal refcounting for async set destruction * @table: table this set belongs to * @net: netnamespace this set belongs to * @name: name of the set * @handle: unique handle of the set * @ktype: key type (numeric type defined by userspace, not used in the kernel) * @dtype: data type (verdict or numeric type defined by userspace) * @objtype: object type (see NFT_OBJECT_* definitions) * @size: maximum set size * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element * @use: number of rules references to this set * @nelems: number of elements * @ndeact: number of deactivated elements queued for removal * @timeout: default timeout value in jiffies * @gc_int: garbage collection interval in msecs * @policy: set parameterization (see enum nft_set_policies) * @udlen: user data length * @udata: user data * @pending_update: list of pending update set element * @ops: set ops * @flags: set flags * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length * @num_exprs: numbers of exprs * @exprs: stateful expression * @catchall_list: list of catch-all set element * @data: private set data */ struct nft_set { struct list_head list; struct list_head bindings; refcount_t refs; struct nft_table *table; possible_net_t net; char *name; u64 handle; u32 ktype; u32 dtype; u32 objtype; u32 size; u8 field_len[NFT_REG32_COUNT]; u8 field_count; u32 use; atomic_t nelems; u32 ndeact; u64 timeout; u32 gc_int; u16 policy; u16 udlen; unsigned char *udata; struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; u16 flags:13, dead:1, genmask:2; u8 klen; u8 dlen; u8 num_exprs; struct nft_expr *exprs[NFT_SET_EXPR_MAX]; struct list_head catchall_list; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline bool nft_set_is_anonymous(const struct nft_set *set) { return set->flags & NFT_SET_ANONYMOUS; } static inline void *nft_set_priv(const struct nft_set *set) { return (void *)set->data; } static inline enum nft_data_types nft_set_datatype(const struct nft_set *set) { return set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE; } static inline bool nft_set_gc_is_pending(const struct nft_set *s) { return refcount_read(&s->refs) != 1; } static inline struct nft_set *nft_set_container_of(const void *priv) { return (void *)priv - offsetof(struct nft_set, data); } struct nft_set *nft_set_lookup_global(const struct net *net, const struct nft_table *table, const struct nlattr *nla_set_name, const struct nlattr *nla_set_id, u8 genmask); struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, const struct nft_set *set); static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { u32 gc_int = READ_ONCE(set->gc_int); return gc_int ? msecs_to_jiffies(gc_int) : HZ; } /** * struct nft_set_binding - nf_tables set binding * * @list: set bindings list node * @chain: chain containing the rule bound to the set * @flags: set action flags * * A set binding contains all information necessary for validation * of new elements added to a bound set. */ struct nft_set_binding { struct list_head list; const struct nft_chain *chain; u32 flags; }; enum nft_trans_phase; void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set); void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, enum nft_trans_phase phase); int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding); void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); /** * enum nft_set_extensions - set extension type IDs * * @NFT_SET_EXT_KEY: element key * @NFT_SET_EXT_KEY_END: upper bound element key, for ranges * @NFT_SET_EXT_DATA: mapping data * @NFT_SET_EXT_FLAGS: element flags * @NFT_SET_EXT_TIMEOUT: element timeout * @NFT_SET_EXT_USERDATA: user data associated with the element * @NFT_SET_EXT_EXPRESSIONS: expressions associated with the element * @NFT_SET_EXT_OBJREF: stateful object reference associated with element * @NFT_SET_EXT_NUM: number of extension types */ enum nft_set_extensions { NFT_SET_EXT_KEY, NFT_SET_EXT_KEY_END, NFT_SET_EXT_DATA, NFT_SET_EXT_FLAGS, NFT_SET_EXT_TIMEOUT, NFT_SET_EXT_USERDATA, NFT_SET_EXT_EXPRESSIONS, NFT_SET_EXT_OBJREF, NFT_SET_EXT_NUM }; /** * struct nft_set_ext_type - set extension type * * @len: fixed part length of the extension * @align: alignment requirements of the extension */ struct nft_set_ext_type { u8 len; u8 align; }; extern const struct nft_set_ext_type nft_set_ext_types[]; /** * struct nft_set_ext_tmpl - set extension template * * @len: length of extension area * @offset: offsets of individual extension types * @ext_len: length of the expected extension(used to sanity check) */ struct nft_set_ext_tmpl { u16 len; u8 offset[NFT_SET_EXT_NUM]; u8 ext_len[NFT_SET_EXT_NUM]; }; /** * struct nft_set_ext - set extensions * * @genmask: generation mask * @offset: offsets of individual extension types * @data: beginning of extension data */ struct nft_set_ext { u8 genmask; u8 offset[NFT_SET_EXT_NUM]; char data[]; }; static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl) { memset(tmpl, 0, sizeof(*tmpl)); tmpl->len = sizeof(struct nft_set_ext); } static inline int nft_set_ext_add_length(struct nft_set_ext_tmpl *tmpl, u8 id, unsigned int len) { tmpl->len = ALIGN(tmpl->len, nft_set_ext_types[id].align); if (tmpl->len > U8_MAX) return -EINVAL; tmpl->offset[id] = tmpl->len; tmpl->ext_len[id] = nft_set_ext_types[id].len + len; tmpl->len += tmpl->ext_len[id]; return 0; } static inline int nft_set_ext_add(struct nft_set_ext_tmpl *tmpl, u8 id) { return nft_set_ext_add_length(tmpl, id, 0); } static inline void nft_set_ext_init(struct nft_set_ext *ext, const struct nft_set_ext_tmpl *tmpl) { memcpy(ext->offset, tmpl->offset, sizeof(ext->offset)); } static inline bool __nft_set_ext_exists(const struct nft_set_ext *ext, u8 id) { return !!ext->offset[id]; } static inline bool nft_set_ext_exists(const struct nft_set_ext *ext, u8 id) { return ext && __nft_set_ext_exists(ext, id); } static inline void *nft_set_ext(const struct nft_set_ext *ext, u8 id) { return (void *)ext + ext->offset[id]; } static inline struct nft_data *nft_set_ext_key(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_KEY); } static inline struct nft_data *nft_set_ext_key_end(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_KEY_END); } static inline struct nft_data *nft_set_ext_data(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_DATA); } static inline u8 *nft_set_ext_flags(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_FLAGS); } struct nft_timeout { u64 timeout; u64 expiration; }; static inline struct nft_timeout *nft_set_ext_timeout(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_TIMEOUT); } static inline struct nft_userdata *nft_set_ext_userdata(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_USERDATA); } static inline struct nft_set_elem_expr *nft_set_ext_expr(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_EXPRESSIONS); } static inline bool __nft_set_elem_expired(const struct nft_set_ext *ext, u64 tstamp) { if (!nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) || READ_ONCE(nft_set_ext_timeout(ext)->timeout) == 0) return false; return time_after_eq64(tstamp, READ_ONCE(nft_set_ext_timeout(ext)->expiration)); } static inline bool nft_set_elem_expired(const struct nft_set_ext *ext) { return __nft_set_elem_expired(ext, get_jiffies_64()); } static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set, const struct nft_elem_priv *elem_priv) { return (void *)elem_priv + set->ops->elemsize; } static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext) { return nft_set_ext(ext, NFT_SET_EXT_OBJREF); } struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx, const struct nft_set *set, const struct nlattr *attr); struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *key_end, const u32 *data, u64 timeout, u64 expiration, gfp_t gfp); int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]); void nft_set_elem_destroy(const struct nft_set *set, const struct nft_elem_priv *elem_priv, bool destroy_expr); void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, const struct nft_set *set, const struct nft_elem_priv *elem_priv); struct nft_expr_ops; /** * struct nft_expr_type - nf_tables expression type * * @select_ops: function to select nft_expr_ops * @release_ops: release nft_expr_ops * @ops: default ops, used when no select_ops functions is present * @inner_ops: inner ops, used for inner packet operation * @list: used internally * @name: Identifier * @owner: module reference * @policy: netlink attribute policy * @maxattr: highest netlink attribute number * @family: address family for AF-specific types * @flags: expression type flags */ struct nft_expr_type { const struct nft_expr_ops *(*select_ops)(const struct nft_ctx *, const struct nlattr * const tb[]); void (*release_ops)(const struct nft_expr_ops *ops); const struct nft_expr_ops *ops; const struct nft_expr_ops *inner_ops; struct list_head list; const char *name; struct module *owner; const struct nla_policy *policy; unsigned int maxattr; u8 family; u8 flags; }; #define NFT_EXPR_STATEFUL 0x1 #define NFT_EXPR_GC 0x2 enum nft_trans_phase { NFT_TRANS_PREPARE, NFT_TRANS_PREPARE_ERROR, NFT_TRANS_ABORT, NFT_TRANS_COMMIT, NFT_TRANS_RELEASE }; struct nft_flow_rule; struct nft_offload_ctx; /** * struct nft_expr_ops - nf_tables expression operations * * @eval: Expression evaluation function * @clone: Expression clone function * @size: full expression size, including private data size * @init: initialization function * @activate: activate expression in the next generation * @deactivate: deactivate expression in next generation * @destroy: destruction function, called after synchronize_rcu * @destroy_clone: destruction clone function * @dump: function to dump parameters * @validate: validate expression, called during loop detection * @reduce: reduce expression * @gc: garbage collection expression * @offload: hardware offload expression * @offload_action: function to report true/false to allocate one slot or not in the flow * offload array * @offload_stats: function to synchronize hardware stats via updating the counter expression * @type: expression type * @data: extra data to attach to this expression operation */ struct nft_expr_ops { void (*eval)(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); int (*clone)(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp); unsigned int size; int (*init)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); void (*activate)(const struct nft_ctx *ctx, const struct nft_expr *expr); void (*deactivate)(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase); void (*destroy)(const struct nft_ctx *ctx, const struct nft_expr *expr); void (*destroy_clone)(const struct nft_ctx *ctx, const struct nft_expr *expr); int (*dump)(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int (*validate)(const struct nft_ctx *ctx, const struct nft_expr *expr); bool (*reduce)(struct nft_regs_track *track, const struct nft_expr *expr); bool (*gc)(struct net *net, const struct nft_expr *expr); int (*offload)(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr); bool (*offload_action)(const struct nft_expr *expr); void (*offload_stats)(struct nft_expr *expr, const struct flow_stats *stats); const struct nft_expr_type *type; void *data; }; /** * struct nft_rule - nf_tables rule * * @list: used internally * @handle: rule handle * @genmask: generation mask * @dlen: length of expression data * @udata: user data is appended to the rule * @data: expression data */ struct nft_rule { struct list_head list; u64 handle:42, genmask:2, dlen:12, udata:1; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; static inline struct nft_expr *nft_expr_first(const struct nft_rule *rule) { return (struct nft_expr *)&rule->data[0]; } static inline struct nft_expr *nft_expr_next(const struct nft_expr *expr) { return ((void *)expr) + expr->ops->size; } static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule) { return (struct nft_expr *)&rule->data[rule->dlen]; } static inline bool nft_expr_more(const struct nft_rule *rule, const struct nft_expr *expr) { return expr != nft_expr_last(rule) && expr->ops; } static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) { return (void *)&rule->data[rule->dlen]; } void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule); void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule, enum nft_trans_phase phase); void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule); static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_set_elem_expr *elem_expr; struct nft_expr *expr; u32 size; if (__nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS)) { elem_expr = nft_set_ext_expr(ext); nft_setelem_expr_foreach(expr, elem_expr, size) { expr->ops->eval(expr, regs, pkt); if (regs->verdict.code == NFT_BREAK) return; } } } /* * The last pointer isn't really necessary, but the compiler isn't able to * determine that the result of nft_expr_last() is always the same since it * can't assume that the dlen value wasn't changed within calls in the loop. */ #define nft_rule_for_each_expr(expr, last, rule) \ for ((expr) = nft_expr_first(rule), (last) = nft_expr_last(rule); \ (expr) != (last); \ (expr) = nft_expr_next(expr)) #define NFT_CHAIN_POLICY_UNSET U8_MAX struct nft_rule_dp { u64 is_last:1, dlen:12, handle:42; /* for tracing */ unsigned char data[] __attribute__((aligned(__alignof__(struct nft_expr)))); }; struct nft_rule_dp_last { struct nft_rule_dp end; /* end of nft_rule_blob marker */ struct rcu_head h; /* call_rcu head */ struct nft_rule_blob *blob; /* ptr to free via call_rcu */ const struct nft_chain *chain; /* for nftables tracing */ }; static inline const struct nft_rule_dp *nft_rule_next(const struct nft_rule_dp *rule) { return (void *)rule + sizeof(*rule) + rule->dlen; } struct nft_rule_blob { unsigned long size; unsigned char data[] __attribute__((aligned(__alignof__(struct nft_rule_dp)))); }; /** * struct nft_chain - nf_tables chain * * @blob_gen_0: rule blob pointer to the current generation * @blob_gen_1: rule blob pointer to the future generation * @rules: list of rules in the chain * @list: used internally * @rhlhead: used internally * @table: table that this chain belongs to * @handle: chain handle * @use: number of jump references to this chain * @flags: bitmask of enum NFTA_CHAIN_FLAGS * @bound: bind or not * @genmask: generation mask * @name: name of the chain * @udlen: user data length * @udata: user data in the chain * @blob_next: rule blob pointer to the next in the chain */ struct nft_chain { struct nft_rule_blob __rcu *blob_gen_0; struct nft_rule_blob __rcu *blob_gen_1; struct list_head rules; struct list_head list; struct rhlist_head rhlhead; struct nft_table *table; u64 handle; u32 use; u8 flags:5, bound:1, genmask:2; char *name; u16 udlen; u8 *udata; /* Only used during control plane commit phase: */ struct nft_rule_blob *blob_next; }; int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain); int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv); int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, NFT_CHAIN_T_ROUTE, NFT_CHAIN_T_NAT, NFT_CHAIN_T_MAX }; /** * struct nft_chain_type - nf_tables chain type info * * @name: name of the type * @type: numeric identifier * @family: address family * @owner: module owner * @hook_mask: mask of valid hooks * @hooks: array of hook functions * @ops_register: base chain register function * @ops_unregister: base chain unregister function */ struct nft_chain_type { const char *name; enum nft_chain_types type; int family; struct module *owner; unsigned int hook_mask; nf_hookfn *hooks[NFT_MAX_HOOKS]; int (*ops_register)(struct net *net, const struct nf_hook_ops *ops); void (*ops_unregister)(struct net *net, const struct nf_hook_ops *ops); }; int nft_chain_validate_dependency(const struct nft_chain *chain, enum nft_chain_types type); int nft_chain_validate_hooks(const struct nft_chain *chain, unsigned int hook_flags); static inline bool nft_chain_binding(const struct nft_chain *chain) { return chain->flags & NFT_CHAIN_BINDING; } static inline bool nft_chain_is_bound(struct nft_chain *chain) { return (chain->flags & NFT_CHAIN_BINDING) && chain->bound; } int nft_chain_add(struct nft_table *table, struct nft_chain *chain); void nft_chain_del(struct nft_chain *chain); void nf_tables_chain_destroy(struct nft_chain *chain); struct nft_stats { u64 bytes; u64 pkts; struct u64_stats_sync syncp; }; struct nft_hook { struct list_head list; struct nf_hook_ops ops; struct rcu_head rcu; }; /** * struct nft_base_chain - nf_tables base chain * * @ops: netfilter hook ops * @hook_list: list of netfilter hooks (for NFPROTO_NETDEV family) * @type: chain type * @policy: default policy * @flags: indicate the base chain disabled or not * @stats: per-cpu chain stats * @chain: the chain * @flow_block: flow block (for hardware offload) */ struct nft_base_chain { struct nf_hook_ops ops; struct list_head hook_list; const struct nft_chain_type *type; u8 policy; u8 flags; struct nft_stats __percpu *stats; struct nft_chain chain; struct flow_block flow_block; }; static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chain) { return container_of(chain, struct nft_base_chain, chain); } static inline bool nft_is_base_chain(const struct nft_chain *chain) { return chain->flags & NFT_CHAIN_BASE; } int __nft_release_basechain(struct nft_ctx *ctx); unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); static inline bool nft_use_inc(u32 *use) { if (*use == UINT_MAX) return false; (*use)++; return true; } static inline void nft_use_dec(u32 *use) { WARN_ON_ONCE((*use)-- == 0); } /* For error and abort path: restore use counter to previous state. */ static inline void nft_use_inc_restore(u32 *use) { WARN_ON_ONCE(!nft_use_inc(use)); } #define nft_use_dec_restore nft_use_dec /** * struct nft_table - nf_tables table * * @list: used internally * @chains_ht: chains in the table * @chains: same, for stable walks * @sets: sets in the table * @objects: stateful objects in the table * @flowtables: flow tables in the table * @hgenerator: handle generator state * @handle: table handle * @use: number of chain references to this table * @family:address family * @flags: table flag (see enum nft_table_flags) * @genmask: generation mask * @nlpid: netlink port ID * @name: name of the table * @udlen: length of the user data * @udata: user data * @validate_state: internal, set when transaction adds jumps */ struct nft_table { struct list_head list; struct rhltable chains_ht; struct list_head chains; struct list_head sets; struct list_head objects; struct list_head flowtables; u64 hgenerator; u64 handle; u32 use; u16 family:6, flags:8, genmask:2; u32 nlpid; char *name; u16 udlen; u8 *udata; u8 validate_state; }; static inline bool nft_table_has_owner(const struct nft_table *table) { return table->flags & NFT_TABLE_F_OWNER; } static inline bool nft_table_is_orphan(const struct nft_table *table) { return (table->flags & (NFT_TABLE_F_OWNER | NFT_TABLE_F_PERSIST)) == NFT_TABLE_F_PERSIST; } static inline bool nft_base_chain_netdev(int family, u32 hooknum) { return family == NFPROTO_NETDEV || (family == NFPROTO_INET && hooknum == NF_INET_INGRESS); } void nft_register_chain_type(const struct nft_chain_type *); void nft_unregister_chain_type(const struct nft_chain_type *); int nft_register_expr(struct nft_expr_type *); void nft_unregister_expr(struct nft_expr_type *); int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v); /** * struct nft_object_hash_key - key to lookup nft_object * * @name: name of the stateful object to look up * @table: table the object belongs to */ struct nft_object_hash_key { const char *name; const struct nft_table *table; }; /** * struct nft_object - nf_tables stateful object * * @list: table stateful object list node * @rhlhead: nft_objname_ht node * @key: keys that identify this object * @genmask: generation mask * @use: number of references to this stateful object * @handle: unique object handle * @udlen: length of user data * @udata: user data * @ops: object operations * @data: object data, layout depends on type */ struct nft_object { struct list_head list; struct rhlist_head rhlhead; struct nft_object_hash_key key; u32 genmask:2; u32 use; u64 handle; u16 udlen; u8 *udata; /* runtime data below here */ const struct nft_object_ops *ops ____cacheline_aligned; unsigned char data[] __attribute__((aligned(__alignof__(u64)))); }; static inline void *nft_obj_data(const struct nft_object *obj) { return (void *)obj->data; } #define nft_expr_obj(expr) *((struct nft_object **)nft_expr_priv(expr)) struct nft_object *nft_obj_lookup(const struct net *net, const struct nft_table *table, const struct nlattr *nla, u32 objtype, u8 genmask); void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, u16 flags, int family, int report, gfp_t gfp); /** * struct nft_object_type - stateful object type * * @select_ops: function to select nft_object_ops * @ops: default ops, used when no select_ops functions is present * @list: list node in list of object types * @type: stateful object numeric type * @owner: module owner * @maxattr: maximum netlink attribute * @family: address family for AF-specific object types * @policy: netlink attribute policy */ struct nft_object_type { const struct nft_object_ops *(*select_ops)(const struct nft_ctx *, const struct nlattr * const tb[]); const struct nft_object_ops *ops; struct list_head list; u32 type; unsigned int maxattr; u8 family; struct module *owner; const struct nla_policy *policy; }; /** * struct nft_object_ops - stateful object operations * * @eval: stateful object evaluation function * @size: stateful object size * @init: initialize object from netlink attributes * @destroy: release existing stateful object * @dump: netlink dump stateful object * @update: update stateful object * @type: pointer to object type */ struct nft_object_ops { void (*eval)(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt); unsigned int size; int (*init)(const struct nft_ctx *ctx, const struct nlattr *const tb[], struct nft_object *obj); void (*destroy)(const struct nft_ctx *ctx, struct nft_object *obj); int (*dump)(struct sk_buff *skb, struct nft_object *obj, bool reset); void (*update)(struct nft_object *obj, struct nft_object *newobj); const struct nft_object_type *type; }; int nft_register_obj(struct nft_object_type *obj_type); void nft_unregister_obj(struct nft_object_type *obj_type); #define NFT_NETDEVICE_MAX 256 /** * struct nft_flowtable - nf_tables flow table * * @list: flow table list node in table list * @table: the table the flow table is contained in * @name: name of this flow table * @hooknum: hook number * @ops_len: number of hooks in array * @genmask: generation mask * @use: number of references to this flow table * @handle: unique object handle * @hook_list: hook list for hooks per net_device in flowtables * @data: rhashtable and garbage collector */ struct nft_flowtable { struct list_head list; struct nft_table *table; char *name; int hooknum; int ops_len; u32 genmask:2; u32 use; u64 handle; /* runtime data below here */ struct list_head hook_list ____cacheline_aligned; struct nf_flowtable data; }; struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, const struct nlattr *nla, u8 genmask); void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx, struct nft_flowtable *flowtable, enum nft_trans_phase phase); void nft_register_flowtable_type(struct nf_flowtable_type *type); void nft_unregister_flowtable_type(struct nf_flowtable_type *type); /** * struct nft_traceinfo - nft tracing information and state * * @trace: other struct members are initialised * @nf_trace: copy of skb->nf_trace before rule evaluation * @type: event type (enum nft_trace_types) * @skbid: hash of skb to be used as trace id * @packet_dumped: packet headers sent in a previous traceinfo message * @basechain: base chain currently processed */ struct nft_traceinfo { bool trace; bool nf_trace; bool packet_dumped; enum nft_trace_types type:8; u32 skbid; const struct nft_base_chain *basechain; }; void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt, const struct nft_chain *basechain); void nft_trace_notify(const struct nft_pktinfo *pkt, const struct nft_verdict *verdict, const struct nft_rule_dp *rule, struct nft_traceinfo *info); #define MODULE_ALIAS_NFT_CHAIN(family, name) \ MODULE_ALIAS("nft-chain-" __stringify(family) "-" name) #define MODULE_ALIAS_NFT_AF_EXPR(family, name) \ MODULE_ALIAS("nft-expr-" __stringify(family) "-" name) #define MODULE_ALIAS_NFT_EXPR(name) \ MODULE_ALIAS("nft-expr-" name) #define MODULE_ALIAS_NFT_OBJ(type) \ MODULE_ALIAS("nft-obj-" __stringify(type)) #if IS_ENABLED(CONFIG_NF_TABLES) /* * The gencursor defines two generations, the currently active and the * next one. Objects contain a bitmask of 2 bits specifying the generations * they're active in. A set bit means they're inactive in the generation * represented by that bit. * * New objects start out as inactive in the current and active in the * next generation. When committing the ruleset the bitmask is cleared, * meaning they're active in all generations. When removing an object, * it is set inactive in the next generation. After committing the ruleset, * the objects are removed. */ static inline unsigned int nft_gencursor_next(const struct net *net) { return net->nft.gencursor + 1 == 1 ? 1 : 0; } static inline u8 nft_genmask_next(const struct net *net) { return 1 << nft_gencursor_next(net); } static inline u8 nft_genmask_cur(const struct net *net) { /* Use READ_ONCE() to prevent refetching the value for atomicity */ return 1 << READ_ONCE(net->nft.gencursor); } #define NFT_GENMASK_ANY ((1 << 0) | (1 << 1)) /* * Generic transaction helpers */ /* Check if this object is currently active. */ #define nft_is_active(__net, __obj) \ (((__obj)->genmask & nft_genmask_cur(__net)) == 0) /* Check if this object is active in the next generation. */ #define nft_is_active_next(__net, __obj) \ (((__obj)->genmask & nft_genmask_next(__net)) == 0) /* This object becomes active in the next generation. */ #define nft_activate_next(__net, __obj) \ (__obj)->genmask = nft_genmask_cur(__net) /* This object becomes inactive in the next generation. */ #define nft_deactivate_next(__net, __obj) \ (__obj)->genmask = nft_genmask_next(__net) /* After committing the ruleset, clear the stale generation bit. */ #define nft_clear(__net, __obj) \ (__obj)->genmask &= ~nft_genmask_next(__net) #define nft_active_genmask(__obj, __genmask) \ !((__obj)->genmask & __genmask) /* * Set element transaction helpers */ static inline bool nft_set_elem_active(const struct nft_set_ext *ext, u8 genmask) { return !(ext->genmask & genmask); } static inline void nft_set_elem_change_active(const struct net *net, const struct nft_set *set, struct nft_set_ext *ext) { ext->genmask ^= nft_genmask_next(net); } #endif /* IS_ENABLED(CONFIG_NF_TABLES) */ #define NFT_SET_ELEM_DEAD_MASK (1 << 2) #if defined(__LITTLE_ENDIAN_BITFIELD) #define NFT_SET_ELEM_DEAD_BIT 2 #elif defined(__BIG_ENDIAN_BITFIELD) #define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) #else #error #endif static inline void nft_set_elem_dead(struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); set_bit(NFT_SET_ELEM_DEAD_BIT, word); } static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); return test_bit(NFT_SET_ELEM_DEAD_BIT, word); } /** * struct nft_trans - nf_tables object update in transaction * * @list: used internally * @net: struct net * @table: struct nft_table the object resides in * @msg_type: message type * @seq: netlink sequence number * @flags: modifiers to new request * @report: notify via unicast netlink message * @put_net: net needs to be put * * This is the information common to all objects in the transaction, * this must always be the first member of derived sub-types. */ struct nft_trans { struct list_head list; struct net *net; struct nft_table *table; int msg_type; u32 seq; u16 flags; u8 report:1; u8 put_net:1; }; /** * struct nft_trans_binding - nf_tables object with binding support in transaction * @nft_trans: base structure, MUST be first member * @binding_list: list of objects with possible bindings * * This is the base type used by objects that can be bound to a chain. */ struct nft_trans_binding { struct nft_trans nft_trans; struct list_head binding_list; }; struct nft_trans_rule { struct nft_trans nft_trans; struct nft_rule *rule; struct nft_chain *chain; struct nft_flow_rule *flow; u32 rule_id; bool bound; }; #define nft_trans_container_rule(trans) \ container_of(trans, struct nft_trans_rule, nft_trans) #define nft_trans_rule(trans) \ nft_trans_container_rule(trans)->rule #define nft_trans_flow_rule(trans) \ nft_trans_container_rule(trans)->flow #define nft_trans_rule_id(trans) \ nft_trans_container_rule(trans)->rule_id #define nft_trans_rule_bound(trans) \ nft_trans_container_rule(trans)->bound #define nft_trans_rule_chain(trans) \ nft_trans_container_rule(trans)->chain struct nft_trans_set { struct nft_trans_binding nft_trans_binding; struct list_head list_trans_newset; struct nft_set *set; u32 set_id; u32 gc_int; u64 timeout; bool update; bool bound; u32 size; }; #define nft_trans_container_set(t) \ container_of(t, struct nft_trans_set, nft_trans_binding.nft_trans) #define nft_trans_set(trans) \ nft_trans_container_set(trans)->set #define nft_trans_set_id(trans) \ nft_trans_container_set(trans)->set_id #define nft_trans_set_bound(trans) \ nft_trans_container_set(trans)->bound #define nft_trans_set_update(trans) \ nft_trans_container_set(trans)->update #define nft_trans_set_timeout(trans) \ nft_trans_container_set(trans)->timeout #define nft_trans_set_gc_int(trans) \ nft_trans_container_set(trans)->gc_int #define nft_trans_set_size(trans) \ nft_trans_container_set(trans)->size struct nft_trans_chain { struct nft_trans_binding nft_trans_binding; struct nft_chain *chain; char *name; struct nft_stats __percpu *stats; u8 policy; bool update; bool bound; u32 chain_id; struct nft_base_chain *basechain; struct list_head hook_list; }; #define nft_trans_container_chain(t) \ container_of(t, struct nft_trans_chain, nft_trans_binding.nft_trans) #define nft_trans_chain(trans) \ nft_trans_container_chain(trans)->chain #define nft_trans_chain_update(trans) \ nft_trans_container_chain(trans)->update #define nft_trans_chain_name(trans) \ nft_trans_container_chain(trans)->name #define nft_trans_chain_stats(trans) \ nft_trans_container_chain(trans)->stats #define nft_trans_chain_policy(trans) \ nft_trans_container_chain(trans)->policy #define nft_trans_chain_bound(trans) \ nft_trans_container_chain(trans)->bound #define nft_trans_chain_id(trans) \ nft_trans_container_chain(trans)->chain_id #define nft_trans_basechain(trans) \ nft_trans_container_chain(trans)->basechain #define nft_trans_chain_hooks(trans) \ nft_trans_container_chain(trans)->hook_list struct nft_trans_table { struct nft_trans nft_trans; bool update; }; #define nft_trans_container_table(trans) \ container_of(trans, struct nft_trans_table, nft_trans) #define nft_trans_table_update(trans) \ nft_trans_container_table(trans)->update enum nft_trans_elem_flags { NFT_TRANS_UPD_TIMEOUT = (1 << 0), NFT_TRANS_UPD_EXPIRATION = (1 << 1), }; struct nft_trans_elem { struct nft_trans nft_trans; struct nft_set *set; struct nft_elem_priv *elem_priv; u64 timeout; u64 expiration; u8 update_flags; bool bound; }; #define nft_trans_container_elem(t) \ container_of(t, struct nft_trans_elem, nft_trans) #define nft_trans_elem_set(trans) \ nft_trans_container_elem(trans)->set #define nft_trans_elem_priv(trans) \ nft_trans_container_elem(trans)->elem_priv #define nft_trans_elem_update_flags(trans) \ nft_trans_container_elem(trans)->update_flags #define nft_trans_elem_timeout(trans) \ nft_trans_container_elem(trans)->timeout #define nft_trans_elem_expiration(trans) \ nft_trans_container_elem(trans)->expiration #define nft_trans_elem_set_bound(trans) \ nft_trans_container_elem(trans)->bound struct nft_trans_obj { struct nft_trans nft_trans; struct nft_object *obj; struct nft_object *newobj; bool update; }; #define nft_trans_container_obj(t) \ container_of(t, struct nft_trans_obj, nft_trans) #define nft_trans_obj(trans) \ nft_trans_container_obj(trans)->obj #define nft_trans_obj_newobj(trans) \ nft_trans_container_obj(trans)->newobj #define nft_trans_obj_update(trans) \ nft_trans_container_obj(trans)->update struct nft_trans_flowtable { struct nft_trans nft_trans; struct nft_flowtable *flowtable; struct list_head hook_list; u32 flags; bool update; }; #define nft_trans_container_flowtable(t) \ container_of(t, struct nft_trans_flowtable, nft_trans) #define nft_trans_flowtable(trans) \ nft_trans_container_flowtable(trans)->flowtable #define nft_trans_flowtable_update(trans) \ nft_trans_container_flowtable(trans)->update #define nft_trans_flowtable_hooks(trans) \ nft_trans_container_flowtable(trans)->hook_list #define nft_trans_flowtable_flags(trans) \ nft_trans_container_flowtable(trans)->flags #define NFT_TRANS_GC_BATCHCOUNT 256 struct nft_trans_gc { struct list_head list; struct net *net; struct nft_set *set; u32 seq; u16 count; struct nft_elem_priv *priv[NFT_TRANS_GC_BATCHCOUNT]; struct rcu_head rcu; }; static inline void nft_ctx_update(struct nft_ctx *ctx, const struct nft_trans *trans) { switch (trans->msg_type) { case NFT_MSG_NEWRULE: case NFT_MSG_DELRULE: case NFT_MSG_DESTROYRULE: ctx->chain = nft_trans_rule_chain(trans); break; case NFT_MSG_NEWCHAIN: case NFT_MSG_DELCHAIN: case NFT_MSG_DESTROYCHAIN: ctx->chain = nft_trans_chain(trans); break; default: ctx->chain = NULL; break; } ctx->net = trans->net; ctx->table = trans->table; ctx->family = trans->table->family; ctx->report = trans->report; ctx->flags = trans->flags; ctx->seq = trans->seq; } struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, unsigned int gc_seq, gfp_t gfp); void nft_trans_gc_destroy(struct nft_trans_gc *trans); struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, unsigned int gc_seq, gfp_t gfp); void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc); struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp); void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, unsigned int gc_seq); struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc); void nft_setelem_data_deactivate(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv); int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); void __init nft_chain_route_init(void); void nft_chain_route_fini(void); void nf_tables_trans_destroy_flush_work(void); int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result); __be64 nf_jiffies64_to_msecs(u64 input); #ifdef CONFIG_MODULES __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, ...); #else static inline int nft_request_module(struct net *net, const char *fmt, ...) { return -ENOENT; } #endif struct nftables_pernet { struct list_head tables; struct list_head commit_list; struct list_head commit_set_list; struct list_head binding_list; struct list_head module_list; struct list_head notify_list; struct mutex commit_mutex; u64 table_handle; u64 tstamp; unsigned int base_seq; unsigned int gc_seq; u8 validate_state; }; extern unsigned int nf_tables_net_id; static inline struct nftables_pernet *nft_pernet(const struct net *net) { return net_generic(net, nf_tables_net_id); } static inline u64 nft_net_tstamp(const struct net *net) { return nft_pernet(net)->tstamp; } #define __NFT_REDUCE_READONLY 1UL #define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY static inline bool nft_reduce_is_readonly(const struct nft_expr *expr) { return expr->ops->reduce == NFT_REDUCE_READONLY; } void nft_reg_track_update(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg, u8 len); void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len); void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg); static inline bool nft_reg_track_cmp(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg) { return track->regs[dreg].selector && track->regs[dreg].selector->ops == expr->ops && track->regs[dreg].num_reg == 0; } #endif /* _NET_NF_TABLES_H */ |
4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SCSI_DISK_H #define _SCSI_DISK_H /* * More than enough for everybody ;) The huge number of majors * is a leftover from 16bit dev_t days, we don't really need that * much numberspace. */ #define SD_MAJORS 16 /* * Time out in seconds for disks and Magneto-opticals (which are slower). */ #define SD_TIMEOUT (30 * HZ) #define SD_MOD_TIMEOUT (75 * HZ) /* * Flush timeout is a multiplier over the standard device timeout which is * user modifiable via sysfs but initially set to SD_TIMEOUT */ #define SD_FLUSH_TIMEOUT_MULTIPLIER 2 #define SD_WRITE_SAME_TIMEOUT (120 * HZ) /* * Number of allowed retries */ #define SD_MAX_RETRIES 5 #define SD_PASSTHROUGH_RETRIES 1 #define SD_MAX_MEDIUM_TIMEOUTS 2 /* * Size of the initial data buffer for mode and read capacity data */ #define SD_BUF_SIZE 512 /* * Number of sectors at the end of the device to avoid multi-sector * accesses to in the case of last_sector_bug */ #define SD_LAST_BUGGY_SECTORS 8 enum { SD_EXT_CDB_SIZE = 32, /* Extended CDB size */ SD_MEMPOOL_SIZE = 2, /* CDB pool size */ }; enum { SD_DEF_XFER_BLOCKS = 0xffff, SD_MAX_XFER_BLOCKS = 0xffffffff, SD_MAX_WS10_BLOCKS = 0xffff, SD_MAX_WS16_BLOCKS = 0x7fffff, }; enum { SD_LBP_FULL = 0, /* Full logical block provisioning */ SD_LBP_UNMAP, /* Use UNMAP command */ SD_LBP_WS16, /* Use WRITE SAME(16) with UNMAP bit */ SD_LBP_WS10, /* Use WRITE SAME(10) with UNMAP bit */ SD_LBP_ZERO, /* Use WRITE SAME(10) with zero payload */ SD_LBP_DISABLE, /* Discard disabled due to failed cmd */ }; enum { SD_ZERO_WRITE = 0, /* Use WRITE(10/16) command */ SD_ZERO_WS, /* Use WRITE SAME(10/16) command */ SD_ZERO_WS16_UNMAP, /* Use WRITE SAME(16) with UNMAP */ SD_ZERO_WS10_UNMAP, /* Use WRITE SAME(10) with UNMAP */ }; /** * struct zoned_disk_info - Specific properties of a ZBC SCSI device. * @nr_zones: number of zones. * @zone_blocks: number of logical blocks per zone. * * This data structure holds the ZBC SCSI device properties that are retrieved * twice: a first time before the gendisk capacity is known and a second time * after the gendisk capacity is known. */ struct zoned_disk_info { u32 nr_zones; u32 zone_blocks; }; struct scsi_disk { struct scsi_device *device; /* * disk_dev is used to show attributes in /sys/class/scsi_disk/, * but otherwise not really needed. Do not use for refcounting. */ struct device disk_dev; struct gendisk *disk; struct opal_dev *opal_dev; #ifdef CONFIG_BLK_DEV_ZONED /* Updated during revalidation before the gendisk capacity is known. */ struct zoned_disk_info early_zone_info; /* Updated during revalidation after the gendisk capacity is known. */ struct zoned_disk_info zone_info; u32 zones_optimal_open; u32 zones_optimal_nonseq; u32 zones_max_open; /* * Either zero or a power of two. If not zero it means that the offset * between zone starting LBAs is constant. */ u32 zone_starting_lba_gran; #endif atomic_t openers; sector_t capacity; /* size in logical blocks */ int max_retries; u32 min_xfer_blocks; u32 max_xfer_blocks; u32 opt_xfer_blocks; u32 max_ws_blocks; u32 max_unmap_blocks; u32 unmap_granularity; u32 unmap_alignment; u32 max_atomic; u32 atomic_alignment; u32 atomic_granularity; u32 max_atomic_with_boundary; u32 max_atomic_boundary; u32 index; unsigned int physical_block_size; unsigned int max_medium_access_timeouts; unsigned int medium_access_timed_out; /* number of permanent streams */ u16 permanent_stream_count; u8 media_present; u8 write_prot; u8 protection_type;/* Data Integrity Field */ u8 provisioning_mode; u8 zeroing_mode; u8 nr_actuators; /* Number of actuators */ bool suspended; /* Disk is suspended (stopped) */ unsigned ATO : 1; /* state of disk ATO bit */ unsigned cache_override : 1; /* temp override of WCE,RCD */ unsigned WCE : 1; /* state of disk WCE bit */ unsigned RCD : 1; /* state of disk RCD bit, unused */ unsigned DPOFUA : 1; /* state of disk DPOFUA bit */ unsigned first_scan : 1; unsigned lbpme : 1; unsigned lbprz : 1; unsigned lbpu : 1; unsigned lbpws : 1; unsigned lbpws10 : 1; unsigned lbpvpd : 1; unsigned ws10 : 1; unsigned ws16 : 1; unsigned rc_basis: 2; unsigned zoned: 2; unsigned urswrz : 1; unsigned security : 1; unsigned ignore_medium_access_errors : 1; unsigned rscs : 1; /* reduced stream control support */ unsigned use_atomic_write_boundary : 1; }; #define to_scsi_disk(obj) container_of(obj, struct scsi_disk, disk_dev) static inline struct scsi_disk *scsi_disk(struct gendisk *disk) { return disk->private_data; } #define sd_printk(prefix, sdsk, fmt, a...) \ (sdsk)->disk ? \ sdev_prefix_printk(prefix, (sdsk)->device, \ (sdsk)->disk->disk_name, fmt, ##a) : \ sdev_printk(prefix, (sdsk)->device, fmt, ##a) #define sd_first_printk(prefix, sdsk, fmt, a...) \ do { \ if ((sdsk)->first_scan) \ sd_printk(prefix, sdsk, fmt, ##a); \ } while (0) static inline int scsi_medium_access_command(struct scsi_cmnd *scmd) { switch (scmd->cmnd[0]) { case READ_6: case READ_10: case READ_12: case READ_16: case SYNCHRONIZE_CACHE: case VERIFY: case VERIFY_12: case VERIFY_16: case WRITE_6: case WRITE_10: case WRITE_12: case WRITE_16: case WRITE_SAME: case WRITE_SAME_16: case UNMAP: return 1; case VARIABLE_LENGTH_CMD: switch (scmd->cmnd[9]) { case READ_32: case VERIFY_32: case WRITE_32: case WRITE_SAME_32: return 1; } } return 0; } static inline sector_t logical_to_sectors(struct scsi_device *sdev, sector_t blocks) { return blocks << (ilog2(sdev->sector_size) - 9); } static inline unsigned int logical_to_bytes(struct scsi_device *sdev, sector_t blocks) { return blocks * sdev->sector_size; } static inline sector_t bytes_to_logical(struct scsi_device *sdev, unsigned int bytes) { return bytes >> ilog2(sdev->sector_size); } static inline sector_t sectors_to_logical(struct scsi_device *sdev, sector_t sector) { return sector >> (ilog2(sdev->sector_size) - 9); } void sd_dif_config_host(struct scsi_disk *sdkp, struct queue_limits *lim); #ifdef CONFIG_BLK_DEV_ZONED int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, u8 buf[SD_BUF_SIZE]); int sd_zbc_revalidate_zones(struct scsi_disk *sdkp); blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, unsigned char op, bool all); unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, struct scsi_sense_hdr *sshdr); int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); #else /* CONFIG_BLK_DEV_ZONED */ static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim, u8 buf[SD_BUF_SIZE]) { return 0; } static inline int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) { return 0; } static inline blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, unsigned char op, bool all) { return BLK_STS_TARGET; } static inline unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, struct scsi_sense_hdr *sshdr) { return good_bytes; } #define sd_zbc_report_zones NULL #endif /* CONFIG_BLK_DEV_ZONED */ void sd_print_sense_hdr(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr); void sd_print_result(const struct scsi_disk *sdkp, const char *msg, int result); #endif /* _SCSI_DISK_H */ |
1 1 7 2 5 1 1 1 1 5 2 5 2 2 5 2 1 1 1 1 2 2 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 | // SPDX-License-Identifier: GPL-2.0-only /* * Connection tracking protocol helper module for GRE. * * GRE is a generic encapsulation protocol, which is generally not very * suited for NAT, as it has no protocol-specific part as port numbers. * * It has an optional key field, which may help us distinguishing two * connections between the same two hosts. * * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784 * * PPTP is built on top of a modified version of GRE, and has a mandatory * field called "CallID", which serves us for the same purpose as the key * field in plain GRE. * * Documentation about PPTP can be found in RFC 2637 * * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org> * * Development of this code funded by Astaro AG (http://www.astaro.com/) * * (C) 2006-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/list.h> #include <linux/seq_file.h> #include <linux/in.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/dst.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <linux/netfilter/nf_conntrack_proto_gre.h> #include <linux/netfilter/nf_conntrack_pptp.h> static const unsigned int gre_timeouts[GRE_CT_MAX] = { [GRE_CT_UNREPLIED] = 30*HZ, [GRE_CT_REPLIED] = 180*HZ, }; /* used when expectation is added */ static DEFINE_SPINLOCK(keymap_lock); static inline struct nf_gre_net *gre_pernet(struct net *net) { return &net->ct.nf_ct_proto.gre; } static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km, const struct nf_conntrack_tuple *t) { return km->tuple.src.l3num == t->src.l3num && !memcmp(&km->tuple.src.u3, &t->src.u3, sizeof(t->src.u3)) && !memcmp(&km->tuple.dst.u3, &t->dst.u3, sizeof(t->dst.u3)) && km->tuple.dst.protonum == t->dst.protonum && km->tuple.dst.u.all == t->dst.u.all; } /* look up the source key for a given tuple */ static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t) { struct nf_gre_net *net_gre = gre_pernet(net); struct nf_ct_gre_keymap *km; __be16 key = 0; list_for_each_entry_rcu(km, &net_gre->keymap_list, list) { if (gre_key_cmpfn(km, t)) { key = km->tuple.src.u.gre.key; break; } } pr_debug("lookup src key 0x%x for ", key); nf_ct_dump_tuple(t); return key; } /* add a single keymap entry, associate with specified master ct */ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, struct nf_conntrack_tuple *t) { struct net *net = nf_ct_net(ct); struct nf_gre_net *net_gre = gre_pernet(net); struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct); struct nf_ct_gre_keymap **kmp, *km; kmp = &ct_pptp_info->keymap[dir]; if (*kmp) { /* check whether it's a retransmission */ list_for_each_entry_rcu(km, &net_gre->keymap_list, list) { if (gre_key_cmpfn(km, t) && km == *kmp) return 0; } pr_debug("trying to override keymap_%s for ct %p\n", dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct); return -EEXIST; } km = kmalloc(sizeof(*km), GFP_ATOMIC); if (!km) return -ENOMEM; memcpy(&km->tuple, t, sizeof(*t)); *kmp = km; pr_debug("adding new entry %p: ", km); nf_ct_dump_tuple(&km->tuple); spin_lock_bh(&keymap_lock); list_add_tail(&km->list, &net_gre->keymap_list); spin_unlock_bh(&keymap_lock); return 0; } EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add); /* destroy the keymap entries associated with specified master ct */ void nf_ct_gre_keymap_destroy(struct nf_conn *ct) { struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct); enum ip_conntrack_dir dir; pr_debug("entering for ct %p\n", ct); spin_lock_bh(&keymap_lock); for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) { if (ct_pptp_info->keymap[dir]) { pr_debug("removing %p from list\n", ct_pptp_info->keymap[dir]); list_del_rcu(&ct_pptp_info->keymap[dir]->list); kfree_rcu(ct_pptp_info->keymap[dir], rcu); ct_pptp_info->keymap[dir] = NULL; } } spin_unlock_bh(&keymap_lock); } EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy); /* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */ /* gre hdr info to tuple */ bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple) { const struct pptp_gre_header *pgrehdr; struct pptp_gre_header _pgrehdr; __be16 srckey; const struct gre_base_hdr *grehdr; struct gre_base_hdr _grehdr; /* first only delinearize old RFC1701 GRE header */ grehdr = skb_header_pointer(skb, dataoff, sizeof(_grehdr), &_grehdr); if (!grehdr || (grehdr->flags & GRE_VERSION) != GRE_VERSION_1) { /* try to behave like "nf_conntrack_proto_generic" */ tuple->src.u.all = 0; tuple->dst.u.all = 0; return true; } /* PPTP header is variable length, only need up to the call_id field */ pgrehdr = skb_header_pointer(skb, dataoff, 8, &_pgrehdr); if (!pgrehdr) return true; if (grehdr->protocol != GRE_PROTO_PPP) { pr_debug("Unsupported GRE proto(0x%x)\n", ntohs(grehdr->protocol)); return false; } tuple->dst.u.gre.key = pgrehdr->call_id; srckey = gre_keymap_lookup(net, tuple); tuple->src.u.gre.key = srckey; return true; } #ifdef CONFIG_NF_CONNTRACK_PROCFS /* print private data for conntrack */ static void gre_print_conntrack(struct seq_file *s, struct nf_conn *ct) { seq_printf(s, "timeout=%u, stream_timeout=%u ", (ct->proto.gre.timeout / HZ), (ct->proto.gre.stream_timeout / HZ)); } #endif static unsigned int *gre_get_timeouts(struct net *net) { return gre_pernet(net)->timeouts; } /* Returns verdict for packet, and may modify conntrack */ int nf_conntrack_gre_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state) { unsigned long status; if (!nf_ct_is_confirmed(ct)) { unsigned int *timeouts = nf_ct_timeout_lookup(ct); if (!timeouts) timeouts = gre_get_timeouts(nf_ct_net(ct)); /* initialize to sane value. Ideally a conntrack helper * (e.g. in case of pptp) is increasing them */ ct->proto.gre.stream_timeout = timeouts[GRE_CT_REPLIED]; ct->proto.gre.timeout = timeouts[GRE_CT_UNREPLIED]; } status = READ_ONCE(ct->status); /* If we've seen traffic both ways, this is a GRE connection. * Extend timeout. */ if (status & IPS_SEEN_REPLY) { nf_ct_refresh_acct(ct, ctinfo, skb, ct->proto.gre.stream_timeout); /* never set ASSURED for IPS_NAT_CLASH, they time out soon */ if (unlikely((status & IPS_NAT_CLASH))) return NF_ACCEPT; /* Also, more likely to be important, and not a probe. */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else nf_ct_refresh_acct(ct, ctinfo, skb, ct->proto.gre.timeout); return NF_ACCEPT; } #ifdef CONFIG_NF_CONNTRACK_TIMEOUT #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_cttimeout.h> static int gre_timeout_nlattr_to_obj(struct nlattr *tb[], struct net *net, void *data) { unsigned int *timeouts = data; struct nf_gre_net *net_gre = gre_pernet(net); if (!timeouts) timeouts = gre_get_timeouts(net); /* set default timeouts for GRE. */ timeouts[GRE_CT_UNREPLIED] = net_gre->timeouts[GRE_CT_UNREPLIED]; timeouts[GRE_CT_REPLIED] = net_gre->timeouts[GRE_CT_REPLIED]; if (tb[CTA_TIMEOUT_GRE_UNREPLIED]) { timeouts[GRE_CT_UNREPLIED] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_GRE_UNREPLIED])) * HZ; } if (tb[CTA_TIMEOUT_GRE_REPLIED]) { timeouts[GRE_CT_REPLIED] = ntohl(nla_get_be32(tb[CTA_TIMEOUT_GRE_REPLIED])) * HZ; } return 0; } static int gre_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data) { const unsigned int *timeouts = data; if (nla_put_be32(skb, CTA_TIMEOUT_GRE_UNREPLIED, htonl(timeouts[GRE_CT_UNREPLIED] / HZ)) || nla_put_be32(skb, CTA_TIMEOUT_GRE_REPLIED, htonl(timeouts[GRE_CT_REPLIED] / HZ))) goto nla_put_failure; return 0; nla_put_failure: return -ENOSPC; } static const struct nla_policy gre_timeout_nla_policy[CTA_TIMEOUT_GRE_MAX+1] = { [CTA_TIMEOUT_GRE_UNREPLIED] = { .type = NLA_U32 }, [CTA_TIMEOUT_GRE_REPLIED] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ void nf_conntrack_gre_init_net(struct net *net) { struct nf_gre_net *net_gre = gre_pernet(net); int i; INIT_LIST_HEAD(&net_gre->keymap_list); for (i = 0; i < GRE_CT_MAX; i++) net_gre->timeouts[i] = gre_timeouts[i]; } /* protocol helper struct */ const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre = { .l4proto = IPPROTO_GRE, .allow_clash = true, #ifdef CONFIG_NF_CONNTRACK_PROCFS .print_conntrack = gre_print_conntrack, #endif #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = gre_timeout_nlattr_to_obj, .obj_to_nlattr = gre_timeout_obj_to_nlattr, .nlattr_max = CTA_TIMEOUT_GRE_MAX, .obj_size = sizeof(unsigned int) * GRE_CT_MAX, .nla_policy = gre_timeout_nla_policy, }, #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ }; |
30 30 30 30 30 30 30 4 1 1 1 3 337 336 161 6 179 45 3 44 45 299 47 4 9 27 13 31 1 10 77 1 3 70 13 43 5 30 28 7 44 5 17 4 4 3 3 2 3 2 1 1 192 337 882 339 473 104 533 145 20 20 20 20 7 17 7 20 3 35 35 5 1392 60 1392 353 1288 1 1389 1389 1 1388 1394 1 20 20 20 1387 1388 1386 336 29 516 418 178 498 461 36 13 38 601 520 218 17 587 81 597 120 2 17 107 110 109 25 97 1 17 7 12 6 3 5 4 4 2 3 13 1 1 31 5 3 1 2 1 1 1227 116 720 89 7 38 97 1 4 37 18 31 192 308 27 9 4 40 1035 987 84 104 445 47 44 461 441 573 55 356 357 358 235 240 8 15 15 15 85 82 84 84 8 8 31 1 15 3 5 1 1 1 1 1 7 2 6 314 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/export.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/if_vlan.h> #include <linux/filter.h> #include <net/dsa.h> #include <net/dst_metadata.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/gre.h> #include <net/pptp.h> #include <net/tipc.h> #include <linux/igmp.h> #include <linux/icmp.h> #include <linux/sctp.h> #include <linux/dccp.h> #include <linux/if_tunnel.h> #include <linux/if_pppox.h> #include <linux/ppp_defs.h> #include <linux/stddef.h> #include <linux/if_ether.h> #include <linux/if_hsr.h> #include <linux/mpls.h> #include <linux/tcp.h> #include <linux/ptp_classify.h> #include <net/flow_dissector.h> #include <net/pkt_cls.h> #include <scsi/fc/fc_fcoe.h> #include <uapi/linux/batadv_packet.h> #include <linux/bpf.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_labels.h> #endif #include <linux/bpf-netns.h> static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { flow_dissector->used_keys |= (1ULL << key_id); } void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count) { unsigned int i; memset(flow_dissector, 0, sizeof(*flow_dissector)); for (i = 0; i < key_count; i++, key++) { /* User should make sure that every key target offset is within * boundaries of unsigned short. */ BUG_ON(key->offset > USHRT_MAX); BUG_ON(dissector_uses_key(flow_dissector, key->key_id)); dissector_set_key(flow_dissector, key->key_id); flow_dissector->offset[key->key_id] = key->offset; } /* Ensure that the dissector always includes control and basic key. * That way we are able to avoid handling lack of these in fast path. */ BUG_ON(!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL)); BUG_ON(!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_BASIC)); } EXPORT_SYMBOL(skb_flow_dissector_init); #ifdef CONFIG_BPF_SYSCALL int flow_dissector_bpf_prog_attach_check(struct net *net, struct bpf_prog *prog) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; if (net == &init_net) { /* BPF flow dissector in the root namespace overrides * any per-net-namespace one. When attaching to root, * make sure we don't have any BPF program attached * to the non-root namespaces. */ struct net *ns; for_each_net(ns) { if (ns == &init_net) continue; if (rcu_access_pointer(ns->bpf.run_array[type])) return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. */ if (rcu_access_pointer(init_net.bpf.run_array[type])) return -EEXIST; } return 0; } #endif /* CONFIG_BPF_SYSCALL */ /** * __skb_flow_get_ports - extract the upper layer ports and return them * @skb: sk_buff to extract the ports from * @thoff: transport header offset * @ip_proto: protocol for which to get port offset * @data: raw buffer pointer to the packet, if NULL use skb->data * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * * The function will try to retrieve the ports at offset thoff + poff where poff * is the protocol port offset returned from proto_ports_offset */ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, const void *data, int hlen) { int poff = proto_ports_offset(ip_proto); if (!data) { data = skb->data; hlen = skb_headlen(skb); } if (poff >= 0) { __be32 *ports, _ports; ports = __skb_header_pointer(skb, thoff + poff, sizeof(_ports), data, hlen, &_ports); if (ports) return *ports; } return 0; } EXPORT_SYMBOL(__skb_flow_get_ports); static bool icmp_has_id(u8 type) { switch (type) { case ICMP_ECHO: case ICMP_ECHOREPLY: case ICMP_TIMESTAMP: case ICMP_TIMESTAMPREPLY: case ICMPV6_ECHO_REQUEST: case ICMPV6_ECHO_REPLY: return true; } return false; } /** * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields * @skb: sk_buff to extract from * @key_icmp: struct flow_dissector_key_icmp to fill * @data: raw buffer pointer to the packet * @thoff: offset to extract at * @hlen: packet header length */ void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, const void *data, int thoff, int hlen) { struct icmphdr *ih, _ih; ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih); if (!ih) return; key_icmp->type = ih->type; key_icmp->code = ih->code; /* As we use 0 to signal that the Id field is not present, * avoid confusion with packets without such field */ if (icmp_has_id(ih->type)) key_icmp->id = ih->un.echo.id ? ntohs(ih->un.echo.id) : 1; else key_icmp->id = 0; } EXPORT_SYMBOL(skb_flow_get_icmp_tci); /* If FLOW_DISSECTOR_KEY_ICMP is set, dissect an ICMP packet * using skb_flow_get_icmp_tci(). */ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int thoff, int hlen) { struct flow_dissector_key_icmp *key_icmp; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP)) return; key_icmp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ICMP, target_container); skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); } static void __skb_flow_dissect_ah(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_ipsec *key_ah; struct ip_auth_hdr _hdr, *hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) return; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return; key_ah = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC, target_container); key_ah->spi = hdr->spi; } static void __skb_flow_dissect_esp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_ipsec *key_esp; struct ip_esp_hdr _hdr, *hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC)) return; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return; key_esp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC, target_container); key_esp->spi = hdr->spi; } static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_l2tpv3 *key_l2tpv3; struct { __be32 session_id; } *hdr, _hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3)) return; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return; key_l2tpv3 = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3, target_container); key_l2tpv3->session_id = hdr->session_id; } void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_meta *meta; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META)) return; meta = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_META, target_container); meta->ingress_ifindex = skb->skb_iif; #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) if (tc_skb_ext_tc_enabled()) { struct tc_skb_ext *ext; ext = skb_ext_find(skb, TC_SKB_EXT); if (ext) meta->l2_miss = ext->l2_miss; } #endif } EXPORT_SYMBOL(skb_flow_dissect_meta); static void skb_flow_dissect_set_enc_control(enum flow_dissector_key_id type, u32 ctrl_flags, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_control *ctrl; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL)) return; ctrl = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL, target_container); ctrl->addr_type = type; ctrl->flags = ctrl_flags; } void skb_flow_dissect_ct(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, u16 *ctinfo_map, size_t mapsize, bool post_ct, u16 zone) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct flow_dissector_key_ct *key; enum ip_conntrack_info ctinfo; struct nf_conn_labels *cl; struct nf_conn *ct; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CT)) return; ct = nf_ct_get(skb, &ctinfo); if (!ct && !post_ct) return; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CT, target_container); if (!ct) { key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED | TCA_FLOWER_KEY_CT_FLAGS_INVALID; key->ct_zone = zone; return; } if (ctinfo < mapsize) key->ct_state = ctinfo_map[ctinfo]; #if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) key->ct_zone = ct->zone.id; #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) key->ct_mark = READ_ONCE(ct->mark); #endif cl = nf_ct_labels_find(ct); if (cl) memcpy(key->ct_labels, cl->bits, sizeof(key->ct_labels)); #endif /* CONFIG_NF_CONNTRACK */ } EXPORT_SYMBOL(skb_flow_dissect_ct); void skb_flow_dissect_tunnel_info(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) { struct ip_tunnel_info *info; struct ip_tunnel_key *key; u32 ctrl_flags = 0; /* A quick check to see if there might be something to do. */ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) return; info = skb_tunnel_info(skb); if (!info) return; key = &info->key; if (test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_CSUM; if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_DONT_FRAGMENT; if (test_bit(IP_TUNNEL_OAM_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_OAM; if (test_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_flags)) ctrl_flags |= FLOW_DIS_F_TUNNEL_CRIT_OPT; switch (ip_tunnel_info_af(info)) { case AF_INET: skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV4_ADDRS, ctrl_flags, flow_dissector, target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) { struct flow_dissector_key_ipv4_addrs *ipv4; ipv4 = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, target_container); ipv4->src = key->u.ipv4.src; ipv4->dst = key->u.ipv4.dst; } break; case AF_INET6: skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV6_ADDRS, ctrl_flags, flow_dissector, target_container); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) { struct flow_dissector_key_ipv6_addrs *ipv6; ipv6 = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, target_container); ipv6->src = key->u.ipv6.src; ipv6->dst = key->u.ipv6.dst; } break; default: skb_flow_dissect_set_enc_control(0, ctrl_flags, flow_dissector, target_container); break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) { struct flow_dissector_key_keyid *keyid; keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID, target_container); keyid->keyid = tunnel_id_to_key32(key->tun_id); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) { struct flow_dissector_key_ports *tp; tp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS, target_container); tp->src = key->tp_src; tp->dst = key->tp_dst; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) { struct flow_dissector_key_ip *ip; ip = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP, target_container); ip->tos = key->tos; ip->ttl = key->ttl; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) { struct flow_dissector_key_enc_opts *enc_opt; IP_TUNNEL_DECLARE_FLAGS(flags) = { }; u32 val; enc_opt = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS, target_container); if (!info->options_len) return; enc_opt->len = info->options_len; ip_tunnel_info_opts_get(enc_opt->data, info); ip_tunnel_set_options_present(flags); ip_tunnel_flags_and(flags, info->key.tun_flags, flags); val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM, IP_TUNNEL_GENEVE_OPT_BIT); enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0; } } EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); void skb_flow_dissect_hash(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_hash *key; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_HASH)) return; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_HASH, target_container); key->hash = skb_get_hash_raw(skb); } EXPORT_SYMBOL(skb_flow_dissect_hash); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen, int lse_index, bool *entropy_label) { struct mpls_label *hdr, _hdr; u32 entry, label, bos; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) return FLOW_DISSECT_RET_OUT_GOOD; if (lse_index >= FLOW_DIS_MPLS_MAX) return FLOW_DISSECT_RET_OUT_GOOD; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; entry = ntohl(hdr->entry); label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) { struct flow_dissector_key_mpls *key_mpls; struct flow_dissector_mpls_lse *lse; key_mpls = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS, target_container); lse = &key_mpls->ls[lse_index]; lse->mpls_ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; lse->mpls_bos = bos; lse->mpls_tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; lse->mpls_label = label; dissector_set_mpls_lse(key_mpls, lse_index); } if (*entropy_label && dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { struct flow_dissector_key_keyid *key_keyid; key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY, target_container); key_keyid->keyid = cpu_to_be32(label); } *entropy_label = label == MPLS_LABEL_ENTROPY; return bos ? FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN; } static enum flow_dissect_ret __skb_flow_dissect_arp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_arp *key_arp; struct { unsigned char ar_sha[ETH_ALEN]; unsigned char ar_sip[4]; unsigned char ar_tha[ETH_ALEN]; unsigned char ar_tip[4]; } *arp_eth, _arp_eth; const struct arphdr *arp; struct arphdr _arp; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ARP)) return FLOW_DISSECT_RET_OUT_GOOD; arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data, hlen, &_arp); if (!arp) return FLOW_DISSECT_RET_OUT_BAD; if (arp->ar_hrd != htons(ARPHRD_ETHER) || arp->ar_pro != htons(ETH_P_IP) || arp->ar_hln != ETH_ALEN || arp->ar_pln != 4 || (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST))) return FLOW_DISSECT_RET_OUT_BAD; arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp), sizeof(_arp_eth), data, hlen, &_arp_eth); if (!arp_eth) return FLOW_DISSECT_RET_OUT_BAD; key_arp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ARP, target_container); memcpy(&key_arp->sip, arp_eth->ar_sip, sizeof(key_arp->sip)); memcpy(&key_arp->tip, arp_eth->ar_tip, sizeof(key_arp->tip)); /* Only store the lower byte of the opcode; * this covers ARPOP_REPLY and ARPOP_REQUEST. */ key_arp->op = ntohs(arp->ar_op) & 0xff; ether_addr_copy(key_arp->sha, arp_eth->ar_sha); ether_addr_copy(key_arp->tha, arp_eth->ar_tha); return FLOW_DISSECT_RET_OUT_GOOD; } static enum flow_dissect_ret __skb_flow_dissect_cfm(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, int hlen) { struct flow_dissector_key_cfm *key, *hdr, _hdr; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CFM)) return FLOW_DISSECT_RET_OUT_GOOD; hdr = __skb_header_pointer(skb, nhoff, sizeof(*key), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CFM, target_container); key->mdl_ver = hdr->mdl_ver; key->opcode = hdr->opcode; return FLOW_DISSECT_RET_OUT_GOOD; } static enum flow_dissect_ret __skb_flow_dissect_gre(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, struct flow_dissector *flow_dissector, void *target_container, const void *data, __be16 *p_proto, int *p_nhoff, int *p_hlen, unsigned int flags) { struct flow_dissector_key_keyid *key_keyid; struct gre_base_hdr *hdr, _hdr; int offset = 0; u16 gre_ver; hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, *p_hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; /* Only look inside GRE without routing */ if (hdr->flags & GRE_ROUTING) return FLOW_DISSECT_RET_OUT_GOOD; /* Only look inside GRE for version 0 and 1 */ gre_ver = ntohs(hdr->flags & GRE_VERSION); if (gre_ver > 1) return FLOW_DISSECT_RET_OUT_GOOD; *p_proto = hdr->protocol; if (gre_ver) { /* Version1 must be PPTP, and check the flags */ if (!(*p_proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY))) return FLOW_DISSECT_RET_OUT_GOOD; } offset += sizeof(struct gre_base_hdr); if (hdr->flags & GRE_CSUM) offset += sizeof_field(struct gre_full_hdr, csum) + sizeof_field(struct gre_full_hdr, reserved1); if (hdr->flags & GRE_KEY) { const __be32 *keyid; __be32 _keyid; keyid = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_keyid), data, *p_hlen, &_keyid); if (!keyid) return FLOW_DISSECT_RET_OUT_BAD; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID)) { key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID, target_container); if (gre_ver == 0) key_keyid->keyid = *keyid; else key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; } offset += sizeof_field(struct gre_full_hdr, key); } if (hdr->flags & GRE_SEQ) offset += sizeof_field(struct pptp_gre_header, seq); if (gre_ver == 0) { if (*p_proto == htons(ETH_P_TEB)) { const struct ethhdr *eth; struct ethhdr _eth; eth = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_eth), data, *p_hlen, &_eth); if (!eth) return FLOW_DISSECT_RET_OUT_BAD; *p_proto = eth->h_proto; offset += sizeof(*eth); /* Cap headers that we access via pointers at the * end of the Ethernet header as our maximum alignment * at that point is only 2 bytes. */ if (NET_IP_ALIGN) *p_hlen = *p_nhoff + offset; } } else { /* version 1, must be PPTP */ u8 _ppp_hdr[PPP_HDRLEN]; u8 *ppp_hdr; if (hdr->flags & GRE_ACK) offset += sizeof_field(struct pptp_gre_header, ack); ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset, sizeof(_ppp_hdr), data, *p_hlen, _ppp_hdr); if (!ppp_hdr) return FLOW_DISSECT_RET_OUT_BAD; switch (PPP_PROTOCOL(ppp_hdr)) { case PPP_IP: *p_proto = htons(ETH_P_IP); break; case PPP_IPV6: *p_proto = htons(ETH_P_IPV6); break; default: /* Could probably catch some more like MPLS */ break; } offset += PPP_HDRLEN; } *p_nhoff += offset; key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) return FLOW_DISSECT_RET_OUT_GOOD; return FLOW_DISSECT_RET_PROTO_AGAIN; } /** * __skb_flow_dissect_batadv() - dissect batman-adv header * @skb: sk_buff to with the batman-adv header * @key_control: flow dissectors control key * @data: raw buffer pointer to the packet, if NULL use skb->data * @p_proto: pointer used to update the protocol to process next * @p_nhoff: pointer used to update inner network header offset * @hlen: packet header length * @flags: any combination of FLOW_DISSECTOR_F_* * * ETH_P_BATMAN packets are tried to be dissected. Only * &struct batadv_unicast packets are actually processed because they contain an * inner ethernet header and are usually followed by actual network header. This * allows the flow dissector to continue processing the packet. * * Return: FLOW_DISSECT_RET_PROTO_AGAIN when &struct batadv_unicast was found, * FLOW_DISSECT_RET_OUT_GOOD when dissector should stop after encapsulation, * otherwise FLOW_DISSECT_RET_OUT_BAD */ static enum flow_dissect_ret __skb_flow_dissect_batadv(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, const void *data, __be16 *p_proto, int *p_nhoff, int hlen, unsigned int flags) { struct { struct batadv_unicast_packet batadv_unicast; struct ethhdr eth; } *hdr, _hdr; hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; if (hdr->batadv_unicast.version != BATADV_COMPAT_VERSION) return FLOW_DISSECT_RET_OUT_BAD; if (hdr->batadv_unicast.packet_type != BATADV_UNICAST) return FLOW_DISSECT_RET_OUT_BAD; *p_proto = hdr->eth.h_proto; *p_nhoff += sizeof(*hdr); key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) return FLOW_DISSECT_RET_OUT_GOOD; return FLOW_DISSECT_RET_PROTO_AGAIN; } static void __skb_flow_dissect_tcp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int thoff, int hlen) { struct flow_dissector_key_tcp *key_tcp; struct tcphdr *th, _th; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP)) return; th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th); if (!th) return; if (unlikely(__tcp_hdrlen(th) < sizeof(_th))) return; key_tcp = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_TCP, target_container); key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF)); } static void __skb_flow_dissect_ports(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, int nhoff, u8 ip_proto, int hlen) { enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX; struct flow_dissector_key_ports *key_ports; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) dissector_ports = FLOW_DISSECTOR_KEY_PORTS; else if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) dissector_ports = FLOW_DISSECTOR_KEY_PORTS_RANGE; if (dissector_ports == FLOW_DISSECTOR_KEY_MAX) return; key_ports = skb_flow_dissector_target(flow_dissector, dissector_ports, target_container); key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); } static void __skb_flow_dissect_ipv4(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, const struct iphdr *iph) { struct flow_dissector_key_ip *key_ip; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) return; key_ip = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IP, target_container); key_ip->tos = iph->tos; key_ip->ttl = iph->ttl; } static void __skb_flow_dissect_ipv6(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, const struct ipv6hdr *iph) { struct flow_dissector_key_ip *key_ip; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) return; key_ip = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IP, target_container); key_ip->tos = ipv6_get_dsfield(iph); key_ip->ttl = iph->hop_limit; } /* Maximum number of protocol headers that can be parsed in * __skb_flow_dissect */ #define MAX_FLOW_DISSECT_HDRS 15 static bool skb_flow_dissect_allowed(int *num_hdrs) { ++*num_hdrs; return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS); } static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector *flow_dissector, void *target_container) { struct flow_dissector_key_ports *key_ports = NULL; struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_tags *key_tags; key_control = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL, target_container); key_control->thoff = flow_keys->thoff; if (flow_keys->is_frag) key_control->flags |= FLOW_DIS_IS_FRAGMENT; if (flow_keys->is_first_frag) key_control->flags |= FLOW_DIS_FIRST_FRAG; if (flow_keys->is_encap) key_control->flags |= FLOW_DIS_ENCAPSULATION; key_basic = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_BASIC, target_container); key_basic->n_proto = flow_keys->n_proto; key_basic->ip_proto = flow_keys->ip_proto; if (flow_keys->addr_proto == ETH_P_IP && dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); key_addrs->v4addrs.src = flow_keys->ipv4_src; key_addrs->v4addrs.dst = flow_keys->ipv4_dst; key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } else if (flow_keys->addr_proto == ETH_P_IPV6 && dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src, sizeof(key_addrs->v6addrs.src)); memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst, sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); else if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE, target_container); if (key_ports) { key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL, target_container); key_tags->flow_label = ntohl(flow_keys->flow_label); } } u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen, unsigned int flags) { struct bpf_flow_keys *flow_keys = ctx->flow_keys; u32 result; /* Pass parameters to the BPF program */ memset(flow_keys, 0, sizeof(*flow_keys)); flow_keys->n_proto = proto; flow_keys->nhoff = nhoff; flow_keys->thoff = flow_keys->nhoff; BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG != (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG); BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL != (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP != (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP); flow_keys->flags = flags; result = bpf_prog_run_pin_on_cpu(prog, ctx); flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen); flow_keys->thoff = clamp_t(u16, flow_keys->thoff, flow_keys->nhoff, hlen); return result; } static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr) { return hdr->ver == 1 && hdr->type == 1 && hdr->code == 0; } /** * __skb_flow_dissect - extract the flow_keys struct and return it * @net: associated network namespace, derived from @skb if NULL * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified * @flow_dissector: list of keys to dissect * @target_container: target structure to put dissected values into * @data: raw buffer pointer to the packet, if NULL use skb->data * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * @flags: flags that control the dissection process, e.g. * FLOW_DISSECTOR_F_STOP_AT_ENCAP. * * The function will try to retrieve individual keys into target specified * by flow_dissector from either the skbuff or a raw buffer specified by the * rest parameters. * * Caller must take care of zeroing target container memory. */ bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, const void *data, __be16 proto, int nhoff, int hlen, unsigned int flags) { struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; bool mpls_el = false; int mpls_lse = 0; int num_hdrs = 0; u8 ip_proto = 0; bool ret; if (!data) { data = skb->data; proto = skb_vlan_tag_present(skb) ? skb->vlan_proto : skb->protocol; nhoff = skb_network_offset(skb); hlen = skb_headlen(skb); #if IS_ENABLED(CONFIG_NET_DSA) if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) && proto == htons(ETH_P_XDSA))) { struct metadata_dst *md_dst = skb_metadata_dst(skb); const struct dsa_device_ops *ops; int offset = 0; ops = skb->dev->dsa_ptr->tag_ops; /* Only DSA header taggers break flow dissection */ if (ops->needed_headroom && (!md_dst || md_dst->type != METADATA_HW_PORT_MUX)) { if (ops->flow_dissect) ops->flow_dissect(skb, &proto, &offset); else dsa_tag_generic_flow_dissect(skb, &proto, &offset); hlen -= offset; nhoff += offset; } } #endif } /* It is ensured by skb_flow_dissector_init() that control key will * be always present. */ key_control = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CONTROL, target_container); /* It is ensured by skb_flow_dissector_init() that basic key will * be always present. */ key_basic = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_BASIC, target_container); if (skb) { if (!net) { if (skb->dev) net = dev_net(skb->dev); else if (skb->sk) net = sock_net(skb->sk); } } DEBUG_NET_WARN_ON_ONCE(!net); if (net) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog_array *run_array; rcu_read_lock(); run_array = rcu_dereference(init_net.bpf.run_array[type]); if (!run_array) run_array = rcu_dereference(net->bpf.run_array[type]); if (run_array) { struct bpf_flow_keys flow_keys; struct bpf_flow_dissector ctx = { .flow_keys = &flow_keys, .data = data, .data_end = data + hlen, }; __be16 n_proto = proto; struct bpf_prog *prog; u32 result; if (skb) { ctx.skb = skb; /* we can't use 'proto' in the skb case * because it might be set to skb->vlan_proto * which has been pulled from the data */ n_proto = skb->protocol; } prog = READ_ONCE(run_array->items[0].prog); result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, hlen, flags); if (result == BPF_FLOW_DISSECTOR_CONTINUE) goto dissect_continue; __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); return result == BPF_OK; } dissect_continue: rcu_read_unlock(); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); struct flow_dissector_key_eth_addrs *key_eth_addrs; key_eth_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS, target_container); memcpy(key_eth_addrs, eth, sizeof(*key_eth_addrs)); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) { struct flow_dissector_key_num_of_vlans *key_num_of_vlans; key_num_of_vlans = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS, target_container); key_num_of_vlans->num_of_vlans = 0; } proto_again: fdret = FLOW_DISSECT_RET_CONTINUE; switch (proto) { case htons(ETH_P_IP): { const struct iphdr *iph; struct iphdr _iph; iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph || iph->ihl < 5) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } nhoff += iph->ihl * 4; ip_proto = iph->protocol; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); memcpy(&key_addrs->v4addrs.src, &iph->saddr, sizeof(key_addrs->v4addrs.src)); memcpy(&key_addrs->v4addrs.dst, &iph->daddr, sizeof(key_addrs->v4addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } __skb_flow_dissect_ipv4(skb, flow_dissector, target_container, data, iph); if (ip_is_fragment(iph)) { key_control->flags |= FLOW_DIS_IS_FRAGMENT; if (iph->frag_off & htons(IP_OFFSET)) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } else { key_control->flags |= FLOW_DIS_FIRST_FRAG; if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } } } break; } case htons(ETH_P_IPV6): { const struct ipv6hdr *iph; struct ipv6hdr _iph; iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); memcpy(&key_addrs->v6addrs.src, &iph->saddr, sizeof(key_addrs->v6addrs.src)); memcpy(&key_addrs->v6addrs.dst, &iph->daddr, sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } if ((dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL) || (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) && ip6_flowlabel(iph)) { __be32 flow_label = ip6_flowlabel(iph); if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL, target_container); key_tags->flow_label = ntohl(flow_label); } if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } } __skb_flow_dissect_ipv6(skb, flow_dissector, target_container, data, iph); break; } case htons(ETH_P_8021AD): case htons(ETH_P_8021Q): { const struct vlan_hdr *vlan = NULL; struct vlan_hdr _vlan; __be16 saved_vlan_tpid = proto; if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX && skb && skb_vlan_tag_present(skb)) { proto = skb->protocol; } else { vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } proto = vlan->h_vlan_encapsulated_proto; nhoff += sizeof(*vlan); } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS) && !(key_control->flags & FLOW_DIS_ENCAPSULATION)) { struct flow_dissector_key_num_of_vlans *key_nvs; key_nvs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS, target_container); key_nvs->num_of_vlans++; } if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) { dissector_vlan = FLOW_DISSECTOR_KEY_VLAN; } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) { dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN; } else { fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; } if (dissector_uses_key(flow_dissector, dissector_vlan)) { key_vlan = skb_flow_dissector_target(flow_dissector, dissector_vlan, target_container); if (!vlan) { key_vlan->vlan_id = skb_vlan_tag_get_id(skb); key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb); } else { key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) & VLAN_VID_MASK; key_vlan->vlan_priority = (ntohs(vlan->h_vlan_TCI) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } key_vlan->vlan_tpid = saved_vlan_tpid; key_vlan->vlan_eth_type = proto; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; } case htons(ETH_P_PPP_SES): { struct { struct pppoe_hdr hdr; __be16 proto; } *hdr, _hdr; u16 ppp_proto; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (!is_pppoe_ses_hdr_valid(&hdr->hdr)) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } /* least significant bit of the most significant octet * indicates if protocol field was compressed */ ppp_proto = ntohs(hdr->proto); if (ppp_proto & 0x0100) { ppp_proto = ppp_proto >> 8; nhoff += PPPOE_SES_HLEN - 1; } else { nhoff += PPPOE_SES_HLEN; } if (ppp_proto == PPP_IP) { proto = htons(ETH_P_IP); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto == PPP_IPV6) { proto = htons(ETH_P_IPV6); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto == PPP_MPLS_UC) { proto = htons(ETH_P_MPLS_UC); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto == PPP_MPLS_MC) { proto = htons(ETH_P_MPLS_MC); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; } else if (ppp_proto_is_valid(ppp_proto)) { fdret = FLOW_DISSECT_RET_OUT_GOOD; } else { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PPPOE)) { struct flow_dissector_key_pppoe *key_pppoe; key_pppoe = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PPPOE, target_container); key_pppoe->session_id = hdr->hdr.sid; key_pppoe->ppp_proto = htons(ppp_proto); key_pppoe->type = htons(ETH_P_PPP_SES); } break; } case htons(ETH_P_TIPC): { struct tipc_basic_hdr *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TIPC)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_TIPC, target_container); key_addrs->tipckey.key = tipc_hdr_rps_key(hdr); key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC; } fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case htons(ETH_P_MPLS_UC): case htons(ETH_P_MPLS_MC): fdret = __skb_flow_dissect_mpls(skb, flow_dissector, target_container, data, nhoff, hlen, mpls_lse, &mpls_el); nhoff += sizeof(struct mpls_label); mpls_lse++; break; case htons(ETH_P_FCOE): if ((hlen - nhoff) < FCOE_HEADER_LEN) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } nhoff += FCOE_HEADER_LEN; fdret = FLOW_DISSECT_RET_OUT_GOOD; break; case htons(ETH_P_ARP): case htons(ETH_P_RARP): fdret = __skb_flow_dissect_arp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case htons(ETH_P_BATMAN): fdret = __skb_flow_dissect_batadv(skb, key_control, data, &proto, &nhoff, hlen, flags); break; case htons(ETH_P_1588): { struct ptp_header *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } nhoff += sizeof(struct ptp_header); fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case htons(ETH_P_PRP): case htons(ETH_P_HSR): { struct hsr_tag *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } proto = hdr->encap_proto; nhoff += HSR_HLEN; fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; } case htons(ETH_P_CFM): fdret = __skb_flow_dissect_cfm(skb, flow_dissector, target_container, data, nhoff, hlen); break; default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; } /* Process result of proto processing */ switch (fdret) { case FLOW_DISSECT_RET_OUT_GOOD: goto out_good; case FLOW_DISSECT_RET_PROTO_AGAIN: if (skb_flow_dissect_allowed(&num_hdrs)) goto proto_again; goto out_good; case FLOW_DISSECT_RET_CONTINUE: case FLOW_DISSECT_RET_IPPROTO_AGAIN: break; case FLOW_DISSECT_RET_OUT_BAD: default: goto out_bad; } ip_proto_again: fdret = FLOW_DISSECT_RET_CONTINUE; switch (ip_proto) { case IPPROTO_GRE: if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector, target_container, data, &proto, &nhoff, &hlen, flags); break; case NEXTHDR_HOP: case NEXTHDR_ROUTING: case NEXTHDR_DEST: { u8 _opthdr[2], *opthdr; if (proto != htons(ETH_P_IPV6)) break; opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), data, hlen, &_opthdr); if (!opthdr) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } ip_proto = opthdr[0]; nhoff += (opthdr[1] + 1) << 3; fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; break; } case NEXTHDR_FRAGMENT: { struct frag_hdr _fh, *fh; if (proto != htons(ETH_P_IPV6)) break; fh = __skb_header_pointer(skb, nhoff, sizeof(_fh), data, hlen, &_fh); if (!fh) { fdret = FLOW_DISSECT_RET_OUT_BAD; break; } key_control->flags |= FLOW_DIS_IS_FRAGMENT; nhoff += sizeof(_fh); ip_proto = fh->nexthdr; if (!(fh->frag_off & htons(IP6_OFFSET))) { key_control->flags |= FLOW_DIS_FIRST_FRAG; if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN; break; } } fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } case IPPROTO_IPIP: if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } proto = htons(ETH_P_IP); key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; case IPPROTO_IPV6: if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } proto = htons(ETH_P_IPV6); key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) { fdret = FLOW_DISSECT_RET_OUT_GOOD; break; } fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; case IPPROTO_MPLS: proto = htons(ETH_P_MPLS_UC); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; break; case IPPROTO_TCP: __skb_flow_dissect_tcp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_ICMP: case IPPROTO_ICMPV6: __skb_flow_dissect_icmp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_L2TP: __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_ESP: __skb_flow_dissect_esp(skb, flow_dissector, target_container, data, nhoff, hlen); break; case IPPROTO_AH: __skb_flow_dissect_ah(skb, flow_dissector, target_container, data, nhoff, hlen); break; default: break; } if (!(key_control->flags & FLOW_DIS_IS_FRAGMENT)) __skb_flow_dissect_ports(skb, flow_dissector, target_container, data, nhoff, ip_proto, hlen); /* Process result of IP proto processing */ switch (fdret) { case FLOW_DISSECT_RET_PROTO_AGAIN: if (skb_flow_dissect_allowed(&num_hdrs)) goto proto_again; break; case FLOW_DISSECT_RET_IPPROTO_AGAIN: if (skb_flow_dissect_allowed(&num_hdrs)) goto ip_proto_again; break; case FLOW_DISSECT_RET_OUT_GOOD: case FLOW_DISSECT_RET_CONTINUE: break; case FLOW_DISSECT_RET_OUT_BAD: default: goto out_bad; } out_good: ret = true; out: key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; return ret; out_bad: ret = false; goto out; } EXPORT_SYMBOL(__skb_flow_dissect); static siphash_aligned_key_t hashrnd; static __always_inline void __flow_hash_secret_init(void) { net_get_random_once(&hashrnd, sizeof(hashrnd)); } static const void *flow_keys_hash_start(const struct flow_keys *flow) { BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT); return &flow->FLOW_KEYS_HASH_START_FIELD; } static inline size_t flow_keys_hash_length(const struct flow_keys *flow) { size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: diff -= sizeof(flow->addrs.v4addrs); break; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: diff -= sizeof(flow->addrs.v6addrs); break; case FLOW_DISSECTOR_KEY_TIPC: diff -= sizeof(flow->addrs.tipckey); break; } return sizeof(*flow) - diff; } __be32 flow_get_u32_src(const struct flow_keys *flow) { switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: return flow->addrs.v4addrs.src; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: return (__force __be32)ipv6_addr_hash( &flow->addrs.v6addrs.src); case FLOW_DISSECTOR_KEY_TIPC: return flow->addrs.tipckey.key; default: return 0; } } EXPORT_SYMBOL(flow_get_u32_src); __be32 flow_get_u32_dst(const struct flow_keys *flow) { switch (flow->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: return flow->addrs.v4addrs.dst; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: return (__force __be32)ipv6_addr_hash( &flow->addrs.v6addrs.dst); default: return 0; } } EXPORT_SYMBOL(flow_get_u32_dst); /* Sort the source and destination IP and the ports, * to have consistent hash within the two directions */ static inline void __flow_hash_consistentify(struct flow_keys *keys) { int addr_diff, i; switch (keys->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: if ((__force u32)keys->addrs.v4addrs.dst < (__force u32)keys->addrs.v4addrs.src) swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); if ((__force u16)keys->ports.dst < (__force u16)keys->ports.src) { swap(keys->ports.src, keys->ports.dst); } break; case FLOW_DISSECTOR_KEY_IPV6_ADDRS: addr_diff = memcmp(&keys->addrs.v6addrs.dst, &keys->addrs.v6addrs.src, sizeof(keys->addrs.v6addrs.dst)); if (addr_diff < 0) { for (i = 0; i < 4; i++) swap(keys->addrs.v6addrs.src.s6_addr32[i], keys->addrs.v6addrs.dst.s6_addr32[i]); } if ((__force u16)keys->ports.dst < (__force u16)keys->ports.src) { swap(keys->ports.src, keys->ports.dst); } break; } } static inline u32 __flow_hash_from_keys(struct flow_keys *keys, const siphash_key_t *keyval) { u32 hash; __flow_hash_consistentify(keys); hash = siphash(flow_keys_hash_start(keys), flow_keys_hash_length(keys), keyval); if (!hash) hash = 1; return hash; } u32 flow_hash_from_keys(struct flow_keys *keys) { __flow_hash_secret_init(); return __flow_hash_from_keys(keys, &hashrnd); } EXPORT_SYMBOL(flow_hash_from_keys); u32 flow_hash_from_keys_seed(struct flow_keys *keys, const siphash_key_t *keyval) { return __flow_hash_from_keys(keys, keyval); } EXPORT_SYMBOL(flow_hash_from_keys_seed); static inline u32 ___skb_get_hash(const struct sk_buff *skb, struct flow_keys *keys, const siphash_key_t *keyval) { skb_flow_dissect_flow_keys(skb, keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); return __flow_hash_from_keys(keys, keyval); } struct _flow_keys_digest_data { __be16 n_proto; u8 ip_proto; u8 padding; __be32 ports; __be32 src; __be32 dst; }; void make_flow_keys_digest(struct flow_keys_digest *digest, const struct flow_keys *flow) { struct _flow_keys_digest_data *data = (struct _flow_keys_digest_data *)digest; BUILD_BUG_ON(sizeof(*data) > sizeof(*digest)); memset(digest, 0, sizeof(*digest)); data->n_proto = flow->basic.n_proto; data->ip_proto = flow->basic.ip_proto; data->ports = flow->ports.ports; data->src = flow->addrs.v4addrs.src; data->dst = flow->addrs.v4addrs.dst; } EXPORT_SYMBOL(make_flow_keys_digest); static struct flow_dissector flow_keys_dissector_symmetric __read_mostly; u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb) { struct flow_keys keys; __flow_hash_secret_init(); memset(&keys, 0, sizeof(keys)); __skb_flow_dissect(net, skb, &flow_keys_dissector_symmetric, &keys, NULL, 0, 0, 0, 0); return __flow_hash_from_keys(&keys, &hashrnd); } EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric_net); /** * __skb_get_hash_net: calculate a flow hash * @net: associated network namespace, derived from @skb if NULL * @skb: sk_buff to calculate flow hash from * * This function calculates a flow hash based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value * on success, zero indicates no valid hash. Also, sets l4_hash in skb * if hash is a canonical 4-tuple hash over transport ports. */ void __skb_get_hash_net(const struct net *net, struct sk_buff *skb) { struct flow_keys keys; u32 hash; memset(&keys, 0, sizeof(keys)); __skb_flow_dissect(net, skb, &flow_keys_dissector, &keys, NULL, 0, 0, 0, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); __flow_hash_secret_init(); hash = __flow_hash_from_keys(&keys, &hashrnd); __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys)); } EXPORT_SYMBOL(__skb_get_hash_net); __u32 skb_get_hash_perturb(const struct sk_buff *skb, const siphash_key_t *perturb) { struct flow_keys keys; return ___skb_get_hash(skb, &keys, perturb); } EXPORT_SYMBOL(skb_get_hash_perturb); u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen) { u32 poff = keys->control.thoff; /* skip L4 headers for fragments after the first */ if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) && !(keys->control.flags & FLOW_DIS_FIRST_FRAG)) return poff; switch (keys->basic.ip_proto) { case IPPROTO_TCP: { /* access doff as u8 to avoid unaligned access */ const u8 *doff; u8 _doff; doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff), data, hlen, &_doff); if (!doff) return poff; poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2); break; } case IPPROTO_UDP: case IPPROTO_UDPLITE: poff += sizeof(struct udphdr); break; /* For the rest, we do not really care about header * extensions at this point for now. */ case IPPROTO_ICMP: poff += sizeof(struct icmphdr); break; case IPPROTO_ICMPV6: poff += sizeof(struct icmp6hdr); break; case IPPROTO_IGMP: poff += sizeof(struct igmphdr); break; case IPPROTO_DCCP: poff += sizeof(struct dccp_hdr); break; case IPPROTO_SCTP: poff += sizeof(struct sctphdr); break; } return poff; } /** * skb_get_poff - get the offset to the payload * @skb: sk_buff to get the payload offset from * * The function will get the offset to the payload as far as it could * be dissected. The main user is currently BPF, so that we can dynamically * truncate packets without needing to push actual payload to the user * space and can analyze headers only, instead. */ u32 skb_get_poff(const struct sk_buff *skb) { struct flow_keys_basic keys; if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0, 0)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); } __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys) { memset(keys, 0, sizeof(*keys)); memcpy(&keys->addrs.v6addrs.src, &fl6->saddr, sizeof(keys->addrs.v6addrs.src)); memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr, sizeof(keys->addrs.v6addrs.dst)); keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; keys->ports.src = fl6->fl6_sport; keys->ports.dst = fl6->fl6_dport; keys->keyid.keyid = fl6->fl6_gre_key; keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); keys->basic.ip_proto = fl6->flowi6_proto; return flow_hash_from_keys(keys); } EXPORT_SYMBOL(__get_hash_from_flowi6); static const struct flow_dissector_key flow_keys_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, .offset = offsetof(struct flow_keys, addrs.v4addrs), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { .key_id = FLOW_DISSECTOR_KEY_TIPC, .offset = offsetof(struct flow_keys, addrs.tipckey), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct flow_keys, ports), }, { .key_id = FLOW_DISSECTOR_KEY_VLAN, .offset = offsetof(struct flow_keys, vlan), }, { .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, .offset = offsetof(struct flow_keys, tags), }, { .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, .offset = offsetof(struct flow_keys, keyid), }, }; static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, .offset = offsetof(struct flow_keys, addrs.v4addrs), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct flow_keys, ports), }, }; static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, }; struct flow_dissector flow_keys_dissector __read_mostly; EXPORT_SYMBOL(flow_keys_dissector); struct flow_dissector flow_keys_basic_dissector __read_mostly; EXPORT_SYMBOL(flow_keys_basic_dissector); static int __init init_default_flow_dissectors(void) { skb_flow_dissector_init(&flow_keys_dissector, flow_keys_dissector_keys, ARRAY_SIZE(flow_keys_dissector_keys)); skb_flow_dissector_init(&flow_keys_dissector_symmetric, flow_keys_dissector_symmetric_keys, ARRAY_SIZE(flow_keys_dissector_symmetric_keys)); skb_flow_dissector_init(&flow_keys_basic_dissector, flow_keys_basic_dissector_keys, ARRAY_SIZE(flow_keys_basic_dissector_keys)); return 0; } core_initcall(init_default_flow_dissectors); |
268 273 14 14 99 99 98 99 10 2 8 9 7 4 5 5 2165 2163 2166 2162 3 3 893 894 32 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 | // SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/gfp.h> #include <linux/hugetlb.h> #include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/fixmap.h> #include <asm/mtrr.h> #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; EXPORT_SYMBOL(physical_mask); #endif #ifdef CONFIG_HIGHPTE #define PGTABLE_HIGHMEM __GFP_HIGHMEM #else #define PGTABLE_HIGHMEM 0 #endif #ifndef CONFIG_PARAVIRT static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) { tlb_remove_page(tlb, table); } #endif gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; pgtable_t pte_alloc_one(struct mm_struct *mm) { return __pte_alloc_one(mm, __userpte_alloc_gfp); } static int __init setup_userpte(char *arg) { if (!arg) return -EINVAL; /* * "userpte=nohigh" disables allocation of user pagetables in * high memory. */ if (strcmp(arg, "nohigh") == 0) __userpte_alloc_gfp &= ~__GFP_HIGHMEM; else return -EINVAL; return 0; } early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pagetable_pte_dtor(page_ptdesc(pte)); paravirt_release_pte(page_to_pfn(pte)); paravirt_tlb_remove_table(tlb, pte); } #if CONFIG_PGTABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { struct ptdesc *ptdesc = virt_to_ptdesc(pmd); paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); /* * NOTE! For PAE, any changes to the top page-directory-pointer-table * entries need a full cr3 reload to flush. */ #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif pagetable_pmd_dtor(ptdesc); paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc)); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { struct ptdesc *ptdesc = virt_to_ptdesc(pud); pagetable_pud_dtor(ptdesc); paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); paravirt_tlb_remove_table(tlb, virt_to_page(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); paravirt_tlb_remove_table(tlb, virt_to_page(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* CONFIG_PGTABLE_LEVELS > 2 */ static inline void pgd_list_add(pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); list_add(&ptdesc->pt_list, &pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); list_del(&ptdesc->pt_list); } #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) #define MAX_UNSHARED_PTRS_PER_PGD \ MAX_T(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD) static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) { virt_to_ptdesc(pgd)->pt_mm = mm; } struct mm_struct *pgd_page_get_mm(struct page *page) { return page_ptdesc(page)->pt_mm; } static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { /* If the pgd points to a shared pagetable level (either the ptes in non-PAE, or shared PMD in PAE), then just copy the references from swapper_pg_dir. */ if (CONFIG_PGTABLE_LEVELS == 2 || (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || CONFIG_PGTABLE_LEVELS >= 4) { clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); } /* list required to sync kernel mapping updates */ if (!SHARED_KERNEL_PMD) { pgd_set_mm(pgd, mm); pgd_list_add(pgd); } } static void pgd_dtor(pgd_t *pgd) { if (SHARED_KERNEL_PMD) return; spin_lock(&pgd_lock); pgd_list_del(pgd); spin_unlock(&pgd_lock); } /* * List of all pgd's needed for non-PAE so it can invalidate entries * in both cached and uncached pgd's; not needed for PAE since the * kernel pmd is shared. If PAE were not to share the pmd a similar * tactic would be needed. This is essentially codepath-based locking * against pageattr.c; it is the unique case in which a valid change * of kernel pagetables can't be lazily synchronized by vmalloc faults. * vmalloc faults work because attached pagetables are never freed. * -- nyc */ #ifdef CONFIG_X86_PAE /* * In PAE mode, we need to do a cr3 reload (=tlb flush) when * updating the top-level pagetable entries to guarantee the * processor notices the update. Since this is expensive, and * all 4 top-level entries are used almost immediately in a * new process's life, we just pre-populate them here. * * Also, if we're in a paravirt environment where the kernel pmd is * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate * and initialize the kernel pmds here. */ #define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD #define MAX_PREALLOCATED_PMDS MAX_UNSHARED_PTRS_PER_PGD /* * We allocate separate PMDs for the kernel part of the user page-table * when PTI is enabled. We need them to map the per-process LDT into the * user-space page-table. */ #define PREALLOCATED_USER_PMDS (boot_cpu_has(X86_FEATURE_PTI) ? \ KERNEL_PGD_PTRS : 0) #define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) { paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); /* Note: almost everything apart from _PAGE_PRESENT is reserved at the pmd (PDPT) level. */ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); /* * According to Intel App note "TLBs, Paging-Structure Caches, * and Their Invalidation", April 2007, document 317080-001, * section 8.1: in PAE mode we explicitly have to flush the * TLB via cr3 if the top-level pgd is changed... */ flush_tlb_mm(mm); } #else /* !CONFIG_X86_PAE */ /* No need to prepopulate any pagetable entries in non-PAE modes. */ #define PREALLOCATED_PMDS 0 #define MAX_PREALLOCATED_PMDS 0 #define PREALLOCATED_USER_PMDS 0 #define MAX_PREALLOCATED_USER_PMDS 0 #endif /* CONFIG_X86_PAE */ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; struct ptdesc *ptdesc; for (i = 0; i < count; i++) if (pmds[i]) { ptdesc = virt_to_ptdesc(pmds[i]); pagetable_pmd_dtor(ptdesc); pagetable_free(ptdesc); mm_dec_nr_pmds(mm); } } static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; bool failed = false; gfp_t gfp = GFP_PGTABLE_USER; if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; gfp &= ~__GFP_HIGHMEM; for (i = 0; i < count; i++) { pmd_t *pmd = NULL; struct ptdesc *ptdesc = pagetable_alloc(gfp, 0); if (!ptdesc) failed = true; if (ptdesc && !pagetable_pmd_ctor(ptdesc)) { pagetable_free(ptdesc); ptdesc = NULL; failed = true; } if (ptdesc) { mm_inc_nr_pmds(mm); pmd = ptdesc_address(ptdesc); } pmds[i] = pmd; } if (failed) { free_pmds(mm, pmds, count); return -ENOMEM; } return 0; } /* * Mop up any pmd pages which may still be attached to the pgd. * Normally they will be freed by munmap/exit_mmap, but any pmd we * preallocate which never got a corresponding vma will need to be * freed manually. */ static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) { pgd_t pgd = *pgdp; if (pgd_val(pgd) != 0) { pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); pgd_clear(pgdp); paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); mm_dec_nr_pmds(mm); } } static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) { int i; for (i = 0; i < PREALLOCATED_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i]); #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (!boot_cpu_has(X86_FEATURE_PTI)) return; pgdp = kernel_to_user_pgdp(pgdp); for (i = 0; i < PREALLOCATED_USER_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]); #endif } static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { p4d_t *p4d; pud_t *pud; int i; p4d = p4d_offset(pgd, 0); pud = pud_offset(p4d, 0); for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; if (i >= KERNEL_PGD_BOUNDARY) memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), sizeof(pmd_t) * PTRS_PER_PMD); pud_populate(mm, pud, pmd); } } #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir); pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); p4d_t *u_p4d; pud_t *u_pud; int i; u_p4d = p4d_offset(u_pgd, 0); u_pud = pud_offset(u_p4d, 0); s_pgd += KERNEL_PGD_BOUNDARY; u_pud += KERNEL_PGD_BOUNDARY; for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) { pmd_t *pmd = pmds[i]; memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd), sizeof(pmd_t) * PTRS_PER_PMD); pud_populate(mm, u_pud, pmd); } } #else static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { } #endif /* * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also * assumes that pgd should be in one page. * * But kernel with PAE paging that is not running as a Xen domain * only needs to allocate 32 bytes for pgd instead of one page. */ #ifdef CONFIG_X86_PAE #include <linux/slab.h> #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) #define PGD_ALIGN 32 static struct kmem_cache *pgd_cache; void __init pgtable_cache_init(void) { /* * When PAE kernel is running as a Xen domain, it does not use * shared kernel pmd. And this requires a whole page for pgd. */ if (!SHARED_KERNEL_PMD) return; /* * when PAE kernel is not running as a Xen domain, it uses * shared kernel pmd. Shared kernel pmd does not require a whole * page for pgd. We are able to just allocate a 32-byte for pgd. * During boot time, we create a 32-byte slab for pgd table allocation. */ pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, SLAB_PANIC, NULL); } static inline pgd_t *_pgd_alloc(void) { /* * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain. * We allocate one page for pgd. */ if (!SHARED_KERNEL_PMD) return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, PGD_ALLOCATION_ORDER); /* * Now PAE kernel is not running as a Xen domain. We can allocate * a 32-byte slab for pgd to save memory space. */ return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER); } static inline void _pgd_free(pgd_t *pgd) { if (!SHARED_KERNEL_PMD) free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); else kmem_cache_free(pgd_cache, pgd); } #else static inline pgd_t *_pgd_alloc(void) { return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, PGD_ALLOCATION_ORDER); } static inline void _pgd_free(pgd_t *pgd) { free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); } #endif /* CONFIG_X86_PAE */ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; pmd_t *pmds[MAX_PREALLOCATED_PMDS]; pgd = _pgd_alloc(); if (pgd == NULL) goto out; mm->pgd = pgd; if (sizeof(pmds) != 0 && preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) goto out_free_pgd; if (sizeof(u_pmds) != 0 && preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) goto out_free_pmds; if (paravirt_pgd_alloc(mm) != 0) goto out_free_user_pmds; /* * Make sure that pre-populating the pmds is atomic with * respect to anything walking the pgd_list, so that they * never see a partially populated pgd. */ spin_lock(&pgd_lock); pgd_ctor(mm, pgd); if (sizeof(pmds) != 0) pgd_prepopulate_pmd(mm, pgd, pmds); if (sizeof(u_pmds) != 0) pgd_prepopulate_user_pmd(mm, pgd, u_pmds); spin_unlock(&pgd_lock); return pgd; out_free_user_pmds: if (sizeof(u_pmds) != 0) free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); out_free_pmds: if (sizeof(pmds) != 0) free_pmds(mm, pmds, PREALLOCATED_PMDS); out_free_pgd: _pgd_free(pgd); out: return NULL; } void pgd_free(struct mm_struct *mm, pgd_t *pgd) { pgd_mop_up_pmds(mm, pgd); pgd_dtor(pgd); paravirt_pgd_free(mm, pgd); _pgd_free(pgd); } /* * Used to set accessed or dirty bits in the page table entries * on other architectures. On x86, the accessed and dirty bits * are tracked by hardware. However, do_wp_page calls this function * to also make the pte writeable at the same time the dirty bit is * set. In that case we do actually need to write the PTE. */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { int changed = !pte_same(*ptep, entry); if (changed && dirty) set_pte(ptep, entry); return changed; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { int changed = !pmd_same(*pmdp, entry); VM_BUG_ON(address & ~HPAGE_PMD_MASK); if (changed && dirty) { set_pmd(pmdp, entry); /* * We had a write-protection fault here and changed the pmd * to to more permissive. No need to flush the TLB for that, * #PF is architecturally guaranteed to do that and in the * worst-case we'll generate a spurious fault. */ } return changed; } int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { int changed = !pud_same(*pudp, entry); VM_BUG_ON(address & ~HPAGE_PUD_MASK); if (changed && dirty) { set_pud(pudp, entry); /* * We had a write-protection fault here and changed the pud * to to more permissive. No need to flush the TLB for that, * #PF is architecturally guaranteed to do that and in the * worst-case we'll generate a spurious fault. */ } return changed; } #endif int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { int ret = 0; if (pte_young(*ptep)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *) &ptep->pte); return ret; } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { int ret = 0; if (pmd_young(*pmdp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmdp); return ret; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp) { int ret = 0; if (pud_young(*pudp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pudp); return ret; } #endif int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { /* * On x86 CPUs, clearing the accessed bit without a TLB flush * doesn't cause data corruption. [ It could cause incorrect * page aging and the (mistaken) reclaim of hot pages, but the * chance of that should be relatively low. ] * * So as a performance optimization don't flush the TLB when * clearing the accessed bit, it will eventually be flushed by * a context switch or a VM operation anyway. [ In the rare * event of it not getting flushed for a long time the delay * shouldn't really matter because there's no real memory * pressure for swapout to react to. ] */ return ptep_test_and_clear_young(vma, address, ptep); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { int young; VM_BUG_ON(address & ~HPAGE_PMD_MASK); young = pmdp_test_and_clear_young(vma, address, pmdp); if (young) flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return young; } pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { VM_WARN_ON_ONCE(!pmd_present(*pmdp)); /* * No flush is necessary. Once an invalid PTE is established, the PTE's * access and dirty bits cannot be updated. */ return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); } #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, pud_t *pudp) { VM_WARN_ON_ONCE(!pud_present(*pudp)); pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp)); flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); return old; } #endif /** * reserve_top_address - reserves a hole in the top of kernel address space * @reserve - size of hole to reserve * * Can be used to relocate the fixmap area and poke a hole in the top * of kernel address space to make room for a hypervisor. */ void __init reserve_top_address(unsigned long reserve) { #ifdef CONFIG_X86_32 BUG_ON(fixmaps_set > 0); __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", -reserve, __FIXADDR_TOP + PAGE_SIZE); #endif } int fixmaps_set; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) { unsigned long address = __fix_to_virt(idx); #ifdef CONFIG_X86_64 /* * Ensure that the static initial page tables are covering the * fixmap completely. */ BUILD_BUG_ON(__end_of_permanent_fixed_addresses > (FIXMAP_PMD_NUM * PTRS_PER_PTE)); #endif if (idx >= __end_of_fixed_addresses) { BUG(); return; } set_pte_vaddr(address, pte); fixmaps_set++; } void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags) { /* Sanitize 'prot' against any unsupported bits: */ pgprot_val(flags) &= __default_kernel_pte_mask; __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); } #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #ifdef CONFIG_X86_5LEVEL /** * p4d_set_huge - setup kernel P4D mapping * * No 512GB pages yet -- always return 0 */ int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } /** * p4d_clear_huge - clear kernel P4D mapping when it is set * * No 512GB pages yet -- always return 0 */ void p4d_clear_huge(p4d_t *p4d) { } #endif /** * pud_set_huge - setup kernel PUD mapping * * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this * function sets up a huge page only if the complete range has the same MTRR * caching mode. * * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger * page mapping attempt fails. * * Returns 1 on success and 0 on failure. */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { u8 uniform; mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); if (!uniform) return 0; /* Bail out if we are we on a populated non-leaf entry: */ if (pud_present(*pud) && !pud_leaf(*pud)) return 0; set_pte((pte_t *)pud, pfn_pte( (u64)addr >> PAGE_SHIFT, __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } /** * pmd_set_huge - setup kernel PMD mapping * * See text over pud_set_huge() above. * * Returns 1 on success and 0 on failure. */ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { u8 uniform; mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); if (!uniform) { pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", __func__, addr, addr + PMD_SIZE); return 0; } /* Bail out if we are we on a populated non-leaf entry: */ if (pmd_present(*pmd) && !pmd_leaf(*pmd)) return 0; set_pte((pte_t *)pmd, pfn_pte( (u64)addr >> PAGE_SHIFT, __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } /** * pud_clear_huge - clear kernel PUD mapping when it is set * * Returns 1 on success and 0 on failure (no PUD map is found). */ int pud_clear_huge(pud_t *pud) { if (pud_leaf(*pud)) { pud_clear(pud); return 1; } return 0; } /** * pmd_clear_huge - clear kernel PMD mapping when it is set * * Returns 1 on success and 0 on failure (no PMD map is found). */ int pmd_clear_huge(pmd_t *pmd) { if (pmd_leaf(*pmd)) { pmd_clear(pmd); return 1; } return 0; } #ifdef CONFIG_X86_64 /** * pud_free_pmd_page - Clear pud entry and free pmd page. * @pud: Pointer to a PUD. * @addr: Virtual address associated with pud. * * Context: The pud range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. * * NOTE: Callers must allow a single page allocation. */ int pud_free_pmd_page(pud_t *pud, unsigned long addr) { pmd_t *pmd, *pmd_sv; pte_t *pte; int i; pmd = pud_pgtable(*pud); pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL); if (!pmd_sv) return 0; for (i = 0; i < PTRS_PER_PMD; i++) { pmd_sv[i] = pmd[i]; if (!pmd_none(pmd[i])) pmd_clear(&pmd[i]); } pud_clear(pud); /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); for (i = 0; i < PTRS_PER_PMD; i++) { if (!pmd_none(pmd_sv[i])) { pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]); free_page((unsigned long)pte); } } free_page((unsigned long)pmd_sv); pagetable_pmd_dtor(virt_to_ptdesc(pmd)); free_page((unsigned long)pmd); return 1; } /** * pmd_free_pte_page - Clear pmd entry and free pte page. * @pmd: Pointer to a PMD. * @addr: Virtual address associated with pmd. * * Context: The pmd range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { pte_t *pte; pte = (pte_t *)pmd_page_vaddr(*pmd); pmd_clear(pmd); /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); free_page((unsigned long)pte); return 1; } #else /* !CONFIG_X86_64 */ /* * Disable free page handling on x86-PAE. This assures that ioremap() * does not update sync'd pmd entries. See vmalloc_sync_one(). */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { return pmd_none(*pmd); } #endif /* CONFIG_X86_64 */ #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHADOW_STACK) return pte_mkwrite_shstk(pte); pte = pte_mkwrite_novma(pte); return pte_clear_saveddirty(pte); } pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHADOW_STACK) return pmd_mkwrite_shstk(pmd); pmd = pmd_mkwrite_novma(pmd); return pmd_clear_saveddirty(pmd); } void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) { /* * Hardware before shadow stack can (rarely) set Dirty=1 * on a Write=0 PTE. So the below condition * only indicates a software bug when shadow stack is * supported by the HW. This checking is covered in * pte_shstk(). */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pte_shstk(pte)); } void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) { /* See note in arch_check_zapped_pte() */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pmd_shstk(pmd)); } void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) { /* See note in arch_check_zapped_pte() */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud)); } |
37 32 4 37 1 8 8 1 8 8 10 1 37 37 36 35 9 36 35 10 4 4 4 4 2 2 4 4 3 1 40 8 9 2 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 | // SPDX-License-Identifier: GPL-2.0-or-later /* * tcp_diag.c Module for monitoring TCP transport protocols sockets. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/net.h> #include <linux/sock_diag.h> #include <linux/inet_diag.h> #include <linux/tcp.h> #include <net/netlink.h> #include <net/tcp.h> static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *_info) { struct tcp_info *info = _info; if (inet_sk_state_load(sk) == TCP_LISTEN) { r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog); r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog); } else if (sk->sk_type == SOCK_STREAM) { const struct tcp_sock *tp = tcp_sk(sk); r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq), 0); r->idiag_wqueue = READ_ONCE(tp->write_seq) - tp->snd_una; } if (info) tcp_get_info(sk, info); } #ifdef CONFIG_TCP_MD5SIG static void tcp_diag_md5sig_fill(struct tcp_diag_md5sig *info, const struct tcp_md5sig_key *key) { info->tcpm_family = key->family; info->tcpm_prefixlen = key->prefixlen; info->tcpm_keylen = key->keylen; memcpy(info->tcpm_key, key->key, key->keylen); if (key->family == AF_INET) info->tcpm_addr[0] = key->addr.a4.s_addr; #if IS_ENABLED(CONFIG_IPV6) else if (key->family == AF_INET6) memcpy(&info->tcpm_addr, &key->addr.a6, sizeof(info->tcpm_addr)); #endif } static int tcp_diag_put_md5sig(struct sk_buff *skb, const struct tcp_md5sig_info *md5sig) { const struct tcp_md5sig_key *key; struct tcp_diag_md5sig *info; struct nlattr *attr; int md5sig_count = 0; hlist_for_each_entry_rcu(key, &md5sig->head, node) md5sig_count++; if (md5sig_count == 0) return 0; attr = nla_reserve(skb, INET_DIAG_MD5SIG, md5sig_count * sizeof(struct tcp_diag_md5sig)); if (!attr) return -EMSGSIZE; info = nla_data(attr); memset(info, 0, md5sig_count * sizeof(struct tcp_diag_md5sig)); hlist_for_each_entry_rcu(key, &md5sig->head, node) { tcp_diag_md5sig_fill(info++, key); if (--md5sig_count == 0) break; } return 0; } #endif static int tcp_diag_put_ulp(struct sk_buff *skb, struct sock *sk, const struct tcp_ulp_ops *ulp_ops) { struct nlattr *nest; int err; nest = nla_nest_start_noflag(skb, INET_DIAG_ULP_INFO); if (!nest) return -EMSGSIZE; err = nla_put_string(skb, INET_ULP_INFO_NAME, ulp_ops->name); if (err) goto nla_failure; if (ulp_ops->get_info) err = ulp_ops->get_info(sk, skb); if (err) goto nla_failure; nla_nest_end(skb, nest); return 0; nla_failure: nla_nest_cancel(skb, nest); return err; } static int tcp_diag_get_aux(struct sock *sk, bool net_admin, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); int err = 0; #ifdef CONFIG_TCP_MD5SIG if (net_admin) { struct tcp_md5sig_info *md5sig; rcu_read_lock(); md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info); if (md5sig) err = tcp_diag_put_md5sig(skb, md5sig); rcu_read_unlock(); if (err < 0) return err; } #endif if (net_admin) { const struct tcp_ulp_ops *ulp_ops; ulp_ops = icsk->icsk_ulp_ops; if (ulp_ops) err = tcp_diag_put_ulp(skb, sk, ulp_ops); if (err) return err; } return 0; } static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); size_t size = 0; #ifdef CONFIG_TCP_MD5SIG if (net_admin && sk_fullsock(sk)) { const struct tcp_md5sig_info *md5sig; const struct tcp_md5sig_key *key; size_t md5sig_count = 0; rcu_read_lock(); md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info); if (md5sig) { hlist_for_each_entry_rcu(key, &md5sig->head, node) md5sig_count++; } rcu_read_unlock(); size += nla_total_size(md5sig_count * sizeof(struct tcp_diag_md5sig)); } #endif if (net_admin && sk_fullsock(sk)) { const struct tcp_ulp_ops *ulp_ops; ulp_ops = icsk->icsk_ulp_ops; if (ulp_ops) { size += nla_total_size(0) + nla_total_size(TCP_ULP_NAME_MAX); if (ulp_ops->get_info_size) size += ulp_ops->get_info_size(sk); } } return size; } static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { struct inet_hashinfo *hinfo; hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo; inet_diag_dump_icsk(hinfo, skb, cb, r); } static int tcp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { struct inet_hashinfo *hinfo; hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo; return inet_diag_dump_one_icsk(hinfo, cb, req); } #ifdef CONFIG_INET_DIAG_DESTROY static int tcp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { struct net *net = sock_net(in_skb->sk); struct inet_hashinfo *hinfo; struct sock *sk; int err; hinfo = net->ipv4.tcp_death_row.hashinfo; sk = inet_diag_find_one_icsk(net, hinfo, req); if (IS_ERR(sk)) return PTR_ERR(sk); err = sock_diag_destroy(sk, ECONNABORTED); sock_gen_put(sk); return err; } #endif static const struct inet_diag_handler tcp_diag_handler = { .owner = THIS_MODULE, .dump = tcp_diag_dump, .dump_one = tcp_diag_dump_one, .idiag_get_info = tcp_diag_get_info, .idiag_get_aux = tcp_diag_get_aux, .idiag_get_aux_size = tcp_diag_get_aux_size, .idiag_type = IPPROTO_TCP, .idiag_info_size = sizeof(struct tcp_info), #ifdef CONFIG_INET_DIAG_DESTROY .destroy = tcp_diag_destroy, #endif }; static int __init tcp_diag_init(void) { return inet_diag_register(&tcp_diag_handler); } static void __exit tcp_diag_exit(void) { inet_diag_unregister(&tcp_diag_handler); } module_init(tcp_diag_init); module_exit(tcp_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("TCP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-6 /* AF_INET - IPPROTO_TCP */); |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ #include <linux/types.h> #include <linux/bpf.h> #include <linux/bpf_local_storage.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> DEFINE_BPF_STORAGE_CACHE(cgroup_cache); static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy); static void bpf_cgrp_storage_lock(void) { migrate_disable(); this_cpu_inc(bpf_cgrp_storage_busy); } static void bpf_cgrp_storage_unlock(void) { this_cpu_dec(bpf_cgrp_storage_busy); migrate_enable(); } static bool bpf_cgrp_storage_trylock(void) { migrate_disable(); if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) { this_cpu_dec(bpf_cgrp_storage_busy); migrate_enable(); return false; } return true; } static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner) { struct cgroup *cg = owner; return &cg->bpf_cgrp_storage; } void bpf_cgrp_storage_free(struct cgroup *cgroup) { struct bpf_local_storage *local_storage; rcu_read_lock(); local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); if (!local_storage) { rcu_read_unlock(); return; } bpf_cgrp_storage_lock(); bpf_local_storage_destroy(local_storage); bpf_cgrp_storage_unlock(); rcu_read_unlock(); } static struct bpf_local_storage_data * cgroup_storage_lookup(struct cgroup *cgroup, struct bpf_map *map, bool cacheit_lockit) { struct bpf_local_storage *cgroup_storage; struct bpf_local_storage_map *smap; cgroup_storage = rcu_dereference_check(cgroup->bpf_cgrp_storage, bpf_rcu_lock_held()); if (!cgroup_storage) return NULL; smap = (struct bpf_local_storage_map *)map; return bpf_local_storage_lookup(cgroup_storage, smap, cacheit_lockit); } static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) { struct bpf_local_storage_data *sdata; struct cgroup *cgroup; int fd; fd = *(int *)key; cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return ERR_CAST(cgroup); bpf_cgrp_storage_lock(); sdata = cgroup_storage_lookup(cgroup, map, true); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return sdata ? sdata->data : NULL; } static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct cgroup *cgroup; int fd; fd = *(int *)key; cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return PTR_ERR(cgroup); bpf_cgrp_storage_lock(); sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, map_flags, GFP_ATOMIC); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return PTR_ERR_OR_ZERO(sdata); } static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) { struct bpf_local_storage_data *sdata; sdata = cgroup_storage_lookup(cgroup, map, false); if (!sdata) return -ENOENT; bpf_selem_unlink(SELEM(sdata), false); return 0; } static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) { struct cgroup *cgroup; int err, fd; fd = *(int *)key; cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return PTR_ERR(cgroup); bpf_cgrp_storage_lock(); err = cgroup_storage_delete(cgroup, map); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return err; } static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -ENOTSUPP; } static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { return bpf_local_storage_map_alloc(attr, &cgroup_cache, true); } static void cgroup_storage_map_free(struct bpf_map *map) { bpf_local_storage_map_free(map, &cgroup_cache, NULL); } /* *gfp_flags* is a hidden argument provided by the verifier */ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) return (unsigned long)NULL; if (!cgroup) return (unsigned long)NULL; if (!bpf_cgrp_storage_trylock()) return (unsigned long)NULL; sdata = cgroup_storage_lookup(cgroup, map, true); if (sdata) goto unlock; /* only allocate new storage, when the cgroup is refcounted */ if (!percpu_ref_is_dying(&cgroup->self.refcnt) && (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, gfp_flags); unlock: bpf_cgrp_storage_unlock(); return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup) { int ret; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!cgroup) return -EINVAL; if (!bpf_cgrp_storage_trylock()) return -EBUSY; ret = cgroup_storage_delete(cgroup, map); bpf_cgrp_storage_unlock(); return ret; } const struct bpf_map_ops cgrp_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, .map_get_next_key = notsupp_get_next_key, .map_lookup_elem = bpf_cgrp_storage_lookup_elem, .map_update_elem = bpf_cgrp_storage_update_elem, .map_delete_elem = bpf_cgrp_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, .map_mem_usage = bpf_local_storage_map_mem_usage, .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = cgroup_storage_ptr, }; const struct bpf_func_proto bpf_cgrp_storage_get_proto = { .func = bpf_cgrp_storage_get, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_cgroup_btf_id[0], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_cgrp_storage_delete_proto = { .func = bpf_cgrp_storage_delete, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_cgroup_btf_id[0], }; |
9 5 4 4 1 4 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 | // SPDX-License-Identifier: GPL-2.0 /* MPTCP socket monitoring support * * Copyright (c) 2019 Red Hat * * Author: Davide Caratti <dcaratti@redhat.com> */ #include <linux/kernel.h> #include <linux/net.h> #include <linux/inet_diag.h> #include <net/netlink.h> #include "protocol.h" static int subflow_get_info(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *sf; struct nlattr *start; u32 flags = 0; bool slow; int err; if (inet_sk_state_load(sk) == TCP_LISTEN) return 0; start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP); if (!start) return -EMSGSIZE; slow = lock_sock_fast(sk); rcu_read_lock(); sf = rcu_dereference(inet_csk(sk)->icsk_ulp_data); if (!sf) { err = 0; goto nla_failure; } if (sf->mp_capable) flags |= MPTCP_SUBFLOW_FLAG_MCAP_REM; if (sf->request_mptcp) flags |= MPTCP_SUBFLOW_FLAG_MCAP_LOC; if (sf->mp_join) flags |= MPTCP_SUBFLOW_FLAG_JOIN_REM; if (sf->request_join) flags |= MPTCP_SUBFLOW_FLAG_JOIN_LOC; if (sf->backup) flags |= MPTCP_SUBFLOW_FLAG_BKUP_REM; if (sf->request_bkup) flags |= MPTCP_SUBFLOW_FLAG_BKUP_LOC; if (sf->fully_established) flags |= MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED; if (sf->conn_finished) flags |= MPTCP_SUBFLOW_FLAG_CONNECTED; if (sf->map_valid) flags |= MPTCP_SUBFLOW_FLAG_MAPVALID; if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_REM, sf->remote_token) || nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_LOC, sf->token) || nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ, sf->rel_write_seq) || nla_put_u64_64bit(skb, MPTCP_SUBFLOW_ATTR_MAP_SEQ, sf->map_seq, MPTCP_SUBFLOW_ATTR_PAD) || nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_MAP_SFSEQ, sf->map_subflow_seq) || nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_SSN_OFFSET, sf->ssn_offset) || nla_put_u16(skb, MPTCP_SUBFLOW_ATTR_MAP_DATALEN, sf->map_data_len) || nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_FLAGS, flags) || nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_REM, sf->remote_id) || nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, subflow_get_local_id(sf))) { err = -EMSGSIZE; goto nla_failure; } rcu_read_unlock(); unlock_sock_fast(sk, slow); nla_nest_end(skb, start); return 0; nla_failure: rcu_read_unlock(); unlock_sock_fast(sk, slow); nla_nest_cancel(skb, start); return err; } static size_t subflow_get_info_size(const struct sock *sk) { size_t size = 0; size += nla_total_size(0) + /* INET_ULP_INFO_MPTCP */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_REM */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_LOC */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */ nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */ nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_FLAGS */ nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_REM */ nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_LOC */ 0; return size; } void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops) { ops->get_info = subflow_get_info; ops->get_info_size = subflow_get_info_size; } |
1 1 6 5 6 6 6 6 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2020 Google Corporation */ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/mgmt.h> #include "mgmt_util.h" #include "msft.h" #define MSFT_RSSI_THRESHOLD_VALUE_MIN -127 #define MSFT_RSSI_THRESHOLD_VALUE_MAX 20 #define MSFT_RSSI_LOW_TIMEOUT_MAX 0x3C #define MSFT_OP_READ_SUPPORTED_FEATURES 0x00 struct msft_cp_read_supported_features { __u8 sub_opcode; } __packed; struct msft_rp_read_supported_features { __u8 status; __u8 sub_opcode; __le64 features; __u8 evt_prefix_len; __u8 evt_prefix[]; } __packed; #define MSFT_OP_LE_MONITOR_ADVERTISEMENT 0x03 #define MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN 0x01 struct msft_le_monitor_advertisement_pattern { __u8 length; __u8 data_type; __u8 start_byte; __u8 pattern[]; }; struct msft_le_monitor_advertisement_pattern_data { __u8 count; __u8 data[]; }; struct msft_cp_le_monitor_advertisement { __u8 sub_opcode; __s8 rssi_high; __s8 rssi_low; __u8 rssi_low_interval; __u8 rssi_sampling_period; __u8 cond_type; __u8 data[]; } __packed; struct msft_rp_le_monitor_advertisement { __u8 status; __u8 sub_opcode; __u8 handle; } __packed; #define MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT 0x04 struct msft_cp_le_cancel_monitor_advertisement { __u8 sub_opcode; __u8 handle; } __packed; struct msft_rp_le_cancel_monitor_advertisement { __u8 status; __u8 sub_opcode; } __packed; #define MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE 0x05 struct msft_cp_le_set_advertisement_filter_enable { __u8 sub_opcode; __u8 enable; } __packed; struct msft_rp_le_set_advertisement_filter_enable { __u8 status; __u8 sub_opcode; } __packed; #define MSFT_EV_LE_MONITOR_DEVICE 0x02 struct msft_ev_le_monitor_device { __u8 addr_type; bdaddr_t bdaddr; __u8 monitor_handle; __u8 monitor_state; } __packed; struct msft_monitor_advertisement_handle_data { __u8 msft_handle; __u16 mgmt_handle; __s8 rssi_high; __s8 rssi_low; __u8 rssi_low_interval; __u8 rssi_sampling_period; __u8 cond_type; struct list_head list; }; enum monitor_addr_filter_state { AF_STATE_IDLE, AF_STATE_ADDING, AF_STATE_ADDED, AF_STATE_REMOVING, }; #define MSFT_MONITOR_ADVERTISEMENT_TYPE_ADDR 0x04 struct msft_monitor_addr_filter_data { __u8 msft_handle; __u8 pattern_handle; /* address filters pertain to */ __u16 mgmt_handle; int state; __s8 rssi_high; __s8 rssi_low; __u8 rssi_low_interval; __u8 rssi_sampling_period; __u8 addr_type; bdaddr_t bdaddr; struct list_head list; }; struct msft_data { __u64 features; __u8 evt_prefix_len; __u8 *evt_prefix; struct list_head handle_map; struct list_head address_filters; __u8 resuming; __u8 suspending; __u8 filter_enabled; /* To synchronize add/remove address filter and monitor device event.*/ struct mutex filter_lock; }; bool msft_monitor_supported(struct hci_dev *hdev) { return !!(msft_get_features(hdev) & MSFT_FEATURE_MASK_LE_ADV_MONITOR); } static bool read_supported_features(struct hci_dev *hdev, struct msft_data *msft) { struct msft_cp_read_supported_features cp; struct msft_rp_read_supported_features *rp; struct sk_buff *skb; cp.sub_opcode = MSFT_OP_READ_SUPPORTED_FEATURES; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to read MSFT supported features (%ld)", PTR_ERR(skb)); return false; } if (skb->len < sizeof(*rp)) { bt_dev_err(hdev, "MSFT supported features length mismatch"); goto failed; } rp = (struct msft_rp_read_supported_features *)skb->data; if (rp->sub_opcode != MSFT_OP_READ_SUPPORTED_FEATURES) goto failed; if (rp->evt_prefix_len > 0) { msft->evt_prefix = kmemdup(rp->evt_prefix, rp->evt_prefix_len, GFP_KERNEL); if (!msft->evt_prefix) goto failed; } msft->evt_prefix_len = rp->evt_prefix_len; msft->features = __le64_to_cpu(rp->features); if (msft->features & MSFT_FEATURE_MASK_CURVE_VALIDITY) hdev->msft_curve_validity = true; kfree_skb(skb); return true; failed: kfree_skb(skb); return false; } /* is_mgmt = true matches the handle exposed to userspace via mgmt. * is_mgmt = false matches the handle used by the msft controller. * This function requires the caller holds hdev->lock */ static struct msft_monitor_advertisement_handle_data *msft_find_handle_data (struct hci_dev *hdev, u16 handle, bool is_mgmt) { struct msft_monitor_advertisement_handle_data *entry; struct msft_data *msft = hdev->msft_data; list_for_each_entry(entry, &msft->handle_map, list) { if (is_mgmt && entry->mgmt_handle == handle) return entry; if (!is_mgmt && entry->msft_handle == handle) return entry; } return NULL; } /* This function requires the caller holds msft->filter_lock */ static struct msft_monitor_addr_filter_data *msft_find_address_data (struct hci_dev *hdev, u8 addr_type, bdaddr_t *addr, u8 pattern_handle) { struct msft_monitor_addr_filter_data *entry; struct msft_data *msft = hdev->msft_data; list_for_each_entry(entry, &msft->address_filters, list) { if (entry->pattern_handle == pattern_handle && addr_type == entry->addr_type && !bacmp(addr, &entry->bdaddr)) return entry; } return NULL; } /* This function requires the caller holds hdev->lock */ static int msft_monitor_device_del(struct hci_dev *hdev, __u16 mgmt_handle, bdaddr_t *bdaddr, __u8 addr_type, bool notify) { struct monitored_device *dev, *tmp; int count = 0; list_for_each_entry_safe(dev, tmp, &hdev->monitored_devices, list) { /* mgmt_handle == 0 indicates remove all devices, whereas, * bdaddr == NULL indicates remove all devices matching the * mgmt_handle. */ if ((!mgmt_handle || dev->handle == mgmt_handle) && (!bdaddr || (!bacmp(bdaddr, &dev->bdaddr) && addr_type == dev->addr_type))) { if (notify && dev->notified) { mgmt_adv_monitor_device_lost(hdev, dev->handle, &dev->bdaddr, dev->addr_type); } list_del(&dev->list); kfree(dev); count++; } } return count; } static int msft_le_monitor_advertisement_cb(struct hci_dev *hdev, u16 opcode, struct adv_monitor *monitor, struct sk_buff *skb) { struct msft_rp_le_monitor_advertisement *rp; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_data *msft = hdev->msft_data; int status = 0; hci_dev_lock(hdev); rp = (struct msft_rp_le_monitor_advertisement *)skb->data; if (skb->len < sizeof(*rp)) { status = HCI_ERROR_UNSPECIFIED; goto unlock; } status = rp->status; if (status) goto unlock; handle_data = kmalloc(sizeof(*handle_data), GFP_KERNEL); if (!handle_data) { status = HCI_ERROR_UNSPECIFIED; goto unlock; } handle_data->mgmt_handle = monitor->handle; handle_data->msft_handle = rp->handle; handle_data->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN; INIT_LIST_HEAD(&handle_data->list); list_add(&handle_data->list, &msft->handle_map); monitor->state = ADV_MONITOR_STATE_OFFLOADED; unlock: if (status) hci_free_adv_monitor(hdev, monitor); hci_dev_unlock(hdev); return status; } /* This function requires the caller holds hci_req_sync_lock */ static void msft_remove_addr_filters_sync(struct hci_dev *hdev, u8 handle) { struct msft_monitor_addr_filter_data *address_filter, *n; struct msft_cp_le_cancel_monitor_advertisement cp; struct msft_data *msft = hdev->msft_data; struct list_head head; struct sk_buff *skb; INIT_LIST_HEAD(&head); /* Cancel all corresponding address monitors */ mutex_lock(&msft->filter_lock); list_for_each_entry_safe(address_filter, n, &msft->address_filters, list) { if (address_filter->pattern_handle != handle) continue; list_del(&address_filter->list); /* Keep the address filter and let * msft_add_address_filter_sync() remove and free the address * filter. */ if (address_filter->state == AF_STATE_ADDING) { address_filter->state = AF_STATE_REMOVING; continue; } /* Keep the address filter and let * msft_cancel_address_filter_sync() remove and free the address * filter */ if (address_filter->state == AF_STATE_REMOVING) continue; list_add_tail(&address_filter->list, &head); } mutex_unlock(&msft->filter_lock); list_for_each_entry_safe(address_filter, n, &head, list) { list_del(&address_filter->list); cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; cp.handle = address_filter->msft_handle; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { kfree(address_filter); continue; } kfree_skb(skb); bt_dev_dbg(hdev, "MSFT: Canceled device %pMR address filter", &address_filter->bdaddr); kfree(address_filter); } } static int msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev, u16 opcode, struct adv_monitor *monitor, struct sk_buff *skb) { struct msft_rp_le_cancel_monitor_advertisement *rp; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_data *msft = hdev->msft_data; int status = 0; u8 msft_handle; rp = (struct msft_rp_le_cancel_monitor_advertisement *)skb->data; if (skb->len < sizeof(*rp)) { status = HCI_ERROR_UNSPECIFIED; goto done; } status = rp->status; if (status) goto done; hci_dev_lock(hdev); handle_data = msft_find_handle_data(hdev, monitor->handle, true); if (handle_data) { if (monitor->state == ADV_MONITOR_STATE_OFFLOADED) monitor->state = ADV_MONITOR_STATE_REGISTERED; /* Do not free the monitor if it is being removed due to * suspend. It will be re-monitored on resume. */ if (!msft->suspending) { hci_free_adv_monitor(hdev, monitor); /* Clear any monitored devices by this Adv Monitor */ msft_monitor_device_del(hdev, handle_data->mgmt_handle, NULL, 0, false); } msft_handle = handle_data->msft_handle; list_del(&handle_data->list); kfree(handle_data); hci_dev_unlock(hdev); msft_remove_addr_filters_sync(hdev, msft_handle); } else { hci_dev_unlock(hdev); } done: return status; } /* This function requires the caller holds hci_req_sync_lock */ static int msft_remove_monitor_sync(struct hci_dev *hdev, struct adv_monitor *monitor) { struct msft_cp_le_cancel_monitor_advertisement cp; struct msft_monitor_advertisement_handle_data *handle_data; struct sk_buff *skb; handle_data = msft_find_handle_data(hdev, monitor->handle, true); /* If no matched handle, just remove without telling controller */ if (!handle_data) return -ENOENT; cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; cp.handle = handle_data->msft_handle; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) return PTR_ERR(skb); return msft_le_cancel_monitor_advertisement_cb(hdev, hdev->msft_opcode, monitor, skb); } /* This function requires the caller holds hci_req_sync_lock */ int msft_suspend_sync(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; struct adv_monitor *monitor; int handle = 0; if (!msft || !msft_monitor_supported(hdev)) return 0; msft->suspending = true; while (1) { monitor = idr_get_next(&hdev->adv_monitors_idr, &handle); if (!monitor) break; msft_remove_monitor_sync(hdev, monitor); handle++; } /* All monitors have been removed */ msft->suspending = false; return 0; } static bool msft_monitor_rssi_valid(struct adv_monitor *monitor) { struct adv_rssi_thresholds *r = &monitor->rssi; if (r->high_threshold < MSFT_RSSI_THRESHOLD_VALUE_MIN || r->high_threshold > MSFT_RSSI_THRESHOLD_VALUE_MAX || r->low_threshold < MSFT_RSSI_THRESHOLD_VALUE_MIN || r->low_threshold > MSFT_RSSI_THRESHOLD_VALUE_MAX) return false; /* High_threshold_timeout is not supported, * once high_threshold is reached, events are immediately reported. */ if (r->high_threshold_timeout != 0) return false; if (r->low_threshold_timeout > MSFT_RSSI_LOW_TIMEOUT_MAX) return false; /* Sampling period from 0x00 to 0xFF are all allowed */ return true; } static bool msft_monitor_pattern_valid(struct adv_monitor *monitor) { return msft_monitor_rssi_valid(monitor); /* No additional check needed for pattern-based monitor */ } static int msft_add_monitor_sync(struct hci_dev *hdev, struct adv_monitor *monitor) { struct msft_cp_le_monitor_advertisement *cp; struct msft_le_monitor_advertisement_pattern_data *pattern_data; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_le_monitor_advertisement_pattern *pattern; struct adv_pattern *entry; size_t total_size = sizeof(*cp) + sizeof(*pattern_data); ptrdiff_t offset = 0; u8 pattern_count = 0; struct sk_buff *skb; int err; if (!msft_monitor_pattern_valid(monitor)) return -EINVAL; list_for_each_entry(entry, &monitor->patterns, list) { pattern_count++; total_size += sizeof(*pattern) + entry->length; } cp = kmalloc(total_size, GFP_KERNEL); if (!cp) return -ENOMEM; cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT; cp->rssi_high = monitor->rssi.high_threshold; cp->rssi_low = monitor->rssi.low_threshold; cp->rssi_low_interval = (u8)monitor->rssi.low_threshold_timeout; cp->rssi_sampling_period = monitor->rssi.sampling_period; cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN; pattern_data = (void *)cp->data; pattern_data->count = pattern_count; list_for_each_entry(entry, &monitor->patterns, list) { pattern = (void *)(pattern_data->data + offset); /* the length also includes data_type and offset */ pattern->length = entry->length + 2; pattern->data_type = entry->ad_type; pattern->start_byte = entry->offset; memcpy(pattern->pattern, entry->value, entry->length); offset += sizeof(*pattern) + entry->length; } skb = __hci_cmd_sync(hdev, hdev->msft_opcode, total_size, cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { err = PTR_ERR(skb); goto out_free; } err = msft_le_monitor_advertisement_cb(hdev, hdev->msft_opcode, monitor, skb); if (err) goto out_free; handle_data = msft_find_handle_data(hdev, monitor->handle, true); if (!handle_data) { err = -ENODATA; goto out_free; } handle_data->rssi_high = cp->rssi_high; handle_data->rssi_low = cp->rssi_low; handle_data->rssi_low_interval = cp->rssi_low_interval; handle_data->rssi_sampling_period = cp->rssi_sampling_period; out_free: kfree(cp); return err; } /* This function requires the caller holds hci_req_sync_lock */ static void reregister_monitor(struct hci_dev *hdev) { struct adv_monitor *monitor; struct msft_data *msft = hdev->msft_data; int handle = 0; if (!msft) return; msft->resuming = true; while (1) { monitor = idr_get_next(&hdev->adv_monitors_idr, &handle); if (!monitor) break; msft_add_monitor_sync(hdev, monitor); handle++; } /* All monitors have been reregistered */ msft->resuming = false; } /* This function requires the caller holds hci_req_sync_lock */ int msft_resume_sync(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; if (!msft || !msft_monitor_supported(hdev)) return 0; hci_dev_lock(hdev); /* Clear already tracked devices on resume. Once the monitors are * reregistered, devices in range will be found again after resume. */ hdev->advmon_pend_notify = false; msft_monitor_device_del(hdev, 0, NULL, 0, true); hci_dev_unlock(hdev); reregister_monitor(hdev); return 0; } /* This function requires the caller holds hci_req_sync_lock */ void msft_do_open(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; if (hdev->msft_opcode == HCI_OP_NOP) return; if (!msft) { bt_dev_err(hdev, "MSFT extension not registered"); return; } bt_dev_dbg(hdev, "Initialize MSFT extension"); /* Reset existing MSFT data before re-reading */ kfree(msft->evt_prefix); msft->evt_prefix = NULL; msft->evt_prefix_len = 0; msft->features = 0; if (!read_supported_features(hdev, msft)) { hdev->msft_data = NULL; kfree(msft); return; } if (msft_monitor_supported(hdev)) { msft->resuming = true; msft_set_filter_enable(hdev, true); /* Monitors get removed on power off, so we need to explicitly * tell the controller to re-monitor. */ reregister_monitor(hdev); } } void msft_do_close(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; struct msft_monitor_advertisement_handle_data *handle_data, *tmp; struct msft_monitor_addr_filter_data *address_filter, *n; struct adv_monitor *monitor; if (!msft) return; bt_dev_dbg(hdev, "Cleanup of MSFT extension"); /* The controller will silently remove all monitors on power off. * Therefore, remove handle_data mapping and reset monitor state. */ list_for_each_entry_safe(handle_data, tmp, &msft->handle_map, list) { monitor = idr_find(&hdev->adv_monitors_idr, handle_data->mgmt_handle); if (monitor && monitor->state == ADV_MONITOR_STATE_OFFLOADED) monitor->state = ADV_MONITOR_STATE_REGISTERED; list_del(&handle_data->list); kfree(handle_data); } mutex_lock(&msft->filter_lock); list_for_each_entry_safe(address_filter, n, &msft->address_filters, list) { list_del(&address_filter->list); kfree(address_filter); } mutex_unlock(&msft->filter_lock); hci_dev_lock(hdev); /* Clear any devices that are being monitored and notify device lost */ hdev->advmon_pend_notify = false; msft_monitor_device_del(hdev, 0, NULL, 0, true); hci_dev_unlock(hdev); } static int msft_cancel_address_filter_sync(struct hci_dev *hdev, void *data) { struct msft_monitor_addr_filter_data *address_filter = data; struct msft_cp_le_cancel_monitor_advertisement cp; struct msft_data *msft = hdev->msft_data; struct sk_buff *skb; int err = 0; if (!msft) { bt_dev_err(hdev, "MSFT: msft data is freed"); return -EINVAL; } /* The address filter has been removed by hci dev close */ if (!test_bit(HCI_UP, &hdev->flags)) return 0; mutex_lock(&msft->filter_lock); list_del(&address_filter->list); mutex_unlock(&msft->filter_lock); cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT; cp.handle = address_filter->msft_handle; skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { bt_dev_err(hdev, "MSFT: Failed to cancel address (%pMR) filter", &address_filter->bdaddr); err = PTR_ERR(skb); goto done; } kfree_skb(skb); bt_dev_dbg(hdev, "MSFT: Canceled device %pMR address filter", &address_filter->bdaddr); done: kfree(address_filter); return err; } void msft_register(struct hci_dev *hdev) { struct msft_data *msft = NULL; bt_dev_dbg(hdev, "Register MSFT extension"); msft = kzalloc(sizeof(*msft), GFP_KERNEL); if (!msft) { bt_dev_err(hdev, "Failed to register MSFT extension"); return; } INIT_LIST_HEAD(&msft->handle_map); INIT_LIST_HEAD(&msft->address_filters); hdev->msft_data = msft; mutex_init(&msft->filter_lock); } void msft_release(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; if (!msft) return; bt_dev_dbg(hdev, "Unregister MSFT extension"); hdev->msft_data = NULL; kfree(msft->evt_prefix); mutex_destroy(&msft->filter_lock); kfree(msft); } /* This function requires the caller holds hdev->lock */ static void msft_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 addr_type, __u16 mgmt_handle) { struct monitored_device *dev; dev = kmalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { bt_dev_err(hdev, "MSFT vendor event %u: no memory", MSFT_EV_LE_MONITOR_DEVICE); return; } bacpy(&dev->bdaddr, bdaddr); dev->addr_type = addr_type; dev->handle = mgmt_handle; dev->notified = false; INIT_LIST_HEAD(&dev->list); list_add(&dev->list, &hdev->monitored_devices); hdev->advmon_pend_notify = true; } /* This function requires the caller holds hdev->lock */ static void msft_device_lost(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 addr_type, __u16 mgmt_handle) { if (!msft_monitor_device_del(hdev, mgmt_handle, bdaddr, addr_type, true)) { bt_dev_err(hdev, "MSFT vendor event %u: dev %pMR not in list", MSFT_EV_LE_MONITOR_DEVICE, bdaddr); } } static void *msft_skb_pull(struct hci_dev *hdev, struct sk_buff *skb, u8 ev, size_t len) { void *data; data = skb_pull_data(skb, len); if (!data) bt_dev_err(hdev, "Malformed MSFT vendor event: 0x%02x", ev); return data; } static int msft_add_address_filter_sync(struct hci_dev *hdev, void *data) { struct msft_monitor_addr_filter_data *address_filter = data; struct msft_rp_le_monitor_advertisement *rp; struct msft_cp_le_monitor_advertisement *cp; struct msft_data *msft = hdev->msft_data; struct sk_buff *skb = NULL; bool remove = false; size_t size; if (!msft) { bt_dev_err(hdev, "MSFT: msft data is freed"); return -EINVAL; } /* The address filter has been removed by hci dev close */ if (!test_bit(HCI_UP, &hdev->flags)) return -ENODEV; /* We are safe to use the address filter from now on. * msft_monitor_device_evt() wouldn't delete this filter because it's * not been added by now. * And all other functions that requiring hci_req_sync_lock wouldn't * touch this filter before this func completes because it's protected * by hci_req_sync_lock. */ if (address_filter->state == AF_STATE_REMOVING) { mutex_lock(&msft->filter_lock); list_del(&address_filter->list); mutex_unlock(&msft->filter_lock); kfree(address_filter); return 0; } size = sizeof(*cp) + sizeof(address_filter->addr_type) + sizeof(address_filter->bdaddr); cp = kzalloc(size, GFP_KERNEL); if (!cp) { bt_dev_err(hdev, "MSFT: Alloc cmd param err"); remove = true; goto done; } cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT; cp->rssi_high = address_filter->rssi_high; cp->rssi_low = address_filter->rssi_low; cp->rssi_low_interval = address_filter->rssi_low_interval; cp->rssi_sampling_period = address_filter->rssi_sampling_period; cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_ADDR; cp->data[0] = address_filter->addr_type; memcpy(&cp->data[1], &address_filter->bdaddr, sizeof(address_filter->bdaddr)); skb = __hci_cmd_sync(hdev, hdev->msft_opcode, size, cp, HCI_CMD_TIMEOUT); kfree(cp); if (IS_ERR(skb)) { bt_dev_err(hdev, "Failed to enable address %pMR filter", &address_filter->bdaddr); skb = NULL; remove = true; goto done; } rp = skb_pull_data(skb, sizeof(*rp)); if (!rp || rp->sub_opcode != MSFT_OP_LE_MONITOR_ADVERTISEMENT || rp->status) remove = true; done: mutex_lock(&msft->filter_lock); if (remove) { bt_dev_warn(hdev, "MSFT: Remove address (%pMR) filter", &address_filter->bdaddr); list_del(&address_filter->list); kfree(address_filter); } else { address_filter->state = AF_STATE_ADDED; address_filter->msft_handle = rp->handle; bt_dev_dbg(hdev, "MSFT: Address %pMR filter enabled", &address_filter->bdaddr); } mutex_unlock(&msft->filter_lock); kfree_skb(skb); return 0; } /* This function requires the caller holds msft->filter_lock */ static struct msft_monitor_addr_filter_data *msft_add_address_filter (struct hci_dev *hdev, u8 addr_type, bdaddr_t *bdaddr, struct msft_monitor_advertisement_handle_data *handle_data) { struct msft_monitor_addr_filter_data *address_filter = NULL; struct msft_data *msft = hdev->msft_data; int err; address_filter = kzalloc(sizeof(*address_filter), GFP_KERNEL); if (!address_filter) return NULL; address_filter->state = AF_STATE_ADDING; address_filter->msft_handle = 0xff; address_filter->pattern_handle = handle_data->msft_handle; address_filter->mgmt_handle = handle_data->mgmt_handle; address_filter->rssi_high = handle_data->rssi_high; address_filter->rssi_low = handle_data->rssi_low; address_filter->rssi_low_interval = handle_data->rssi_low_interval; address_filter->rssi_sampling_period = handle_data->rssi_sampling_period; address_filter->addr_type = addr_type; bacpy(&address_filter->bdaddr, bdaddr); /* With the above AF_STATE_ADDING, duplicated address filter can be * avoided when receiving monitor device event (found/lost) frequently * for the same device. */ list_add_tail(&address_filter->list, &msft->address_filters); err = hci_cmd_sync_queue(hdev, msft_add_address_filter_sync, address_filter, NULL); if (err < 0) { bt_dev_err(hdev, "MSFT: Add address %pMR filter err", bdaddr); list_del(&address_filter->list); kfree(address_filter); return NULL; } bt_dev_dbg(hdev, "MSFT: Add device %pMR address filter", &address_filter->bdaddr); return address_filter; } /* This function requires the caller holds hdev->lock */ static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct msft_monitor_addr_filter_data *n, *address_filter = NULL; struct msft_ev_le_monitor_device *ev; struct msft_monitor_advertisement_handle_data *handle_data; struct msft_data *msft = hdev->msft_data; u16 mgmt_handle = 0xffff; u8 addr_type; ev = msft_skb_pull(hdev, skb, MSFT_EV_LE_MONITOR_DEVICE, sizeof(*ev)); if (!ev) return; bt_dev_dbg(hdev, "MSFT vendor event 0x%02x: handle 0x%04x state %d addr %pMR", MSFT_EV_LE_MONITOR_DEVICE, ev->monitor_handle, ev->monitor_state, &ev->bdaddr); handle_data = msft_find_handle_data(hdev, ev->monitor_handle, false); if (!test_bit(HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER, &hdev->quirks)) { if (!handle_data) return; mgmt_handle = handle_data->mgmt_handle; goto report_state; } if (handle_data) { /* Don't report any device found/lost event from pattern * monitors. Pattern monitor always has its address filters for * tracking devices. */ address_filter = msft_find_address_data(hdev, ev->addr_type, &ev->bdaddr, handle_data->msft_handle); if (address_filter) return; if (ev->monitor_state && handle_data->cond_type == MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN) msft_add_address_filter(hdev, ev->addr_type, &ev->bdaddr, handle_data); return; } /* This device event is not from pattern monitor. * Report it if there is a corresponding address_filter for it. */ list_for_each_entry(n, &msft->address_filters, list) { if (n->state == AF_STATE_ADDED && n->msft_handle == ev->monitor_handle) { mgmt_handle = n->mgmt_handle; address_filter = n; break; } } if (!address_filter) { bt_dev_warn(hdev, "MSFT: Unexpected device event %pMR, %u, %u", &ev->bdaddr, ev->monitor_handle, ev->monitor_state); return; } report_state: switch (ev->addr_type) { case ADDR_LE_DEV_PUBLIC: addr_type = BDADDR_LE_PUBLIC; break; case ADDR_LE_DEV_RANDOM: addr_type = BDADDR_LE_RANDOM; break; default: bt_dev_err(hdev, "MSFT vendor event 0x%02x: unknown addr type 0x%02x", MSFT_EV_LE_MONITOR_DEVICE, ev->addr_type); return; } if (ev->monitor_state) { msft_device_found(hdev, &ev->bdaddr, addr_type, mgmt_handle); } else { if (address_filter && address_filter->state == AF_STATE_ADDED) { address_filter->state = AF_STATE_REMOVING; hci_cmd_sync_queue(hdev, msft_cancel_address_filter_sync, address_filter, NULL); } msft_device_lost(hdev, &ev->bdaddr, addr_type, mgmt_handle); } } void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { struct msft_data *msft = hdev->msft_data; u8 *evt_prefix; u8 *evt; if (!msft) return; /* When the extension has defined an event prefix, check that it * matches, and otherwise just return. */ if (msft->evt_prefix_len > 0) { evt_prefix = msft_skb_pull(hdev, skb, 0, msft->evt_prefix_len); if (!evt_prefix) return; if (memcmp(evt_prefix, msft->evt_prefix, msft->evt_prefix_len)) return; } /* Every event starts at least with an event code and the rest of * the data is variable and depends on the event code. */ if (skb->len < 1) return; evt = msft_skb_pull(hdev, skb, 0, sizeof(*evt)); if (!evt) return; hci_dev_lock(hdev); switch (*evt) { case MSFT_EV_LE_MONITOR_DEVICE: mutex_lock(&msft->filter_lock); msft_monitor_device_evt(hdev, skb); mutex_unlock(&msft->filter_lock); break; default: bt_dev_dbg(hdev, "MSFT vendor event 0x%02x", *evt); break; } hci_dev_unlock(hdev); } __u64 msft_get_features(struct hci_dev *hdev) { struct msft_data *msft = hdev->msft_data; return msft ? msft->features : 0; } static void msft_le_set_advertisement_filter_enable_cb(struct hci_dev *hdev, void *user_data, u8 status) { struct msft_cp_le_set_advertisement_filter_enable *cp = user_data; struct msft_data *msft = hdev->msft_data; /* Error 0x0C would be returned if the filter enabled status is * already set to whatever we were trying to set. * Although the default state should be disabled, some controller set * the initial value to enabled. Because there is no way to know the * actual initial value before sending this command, here we also treat * error 0x0C as success. */ if (status != 0x00 && status != 0x0C) return; hci_dev_lock(hdev); msft->filter_enabled = cp->enable; if (status == 0x0C) bt_dev_warn(hdev, "MSFT filter_enable is already %s", cp->enable ? "on" : "off"); hci_dev_unlock(hdev); } /* This function requires the caller holds hci_req_sync_lock */ int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor) { struct msft_data *msft = hdev->msft_data; if (!msft) return -EOPNOTSUPP; if (msft->resuming || msft->suspending) return -EBUSY; return msft_add_monitor_sync(hdev, monitor); } /* This function requires the caller holds hci_req_sync_lock */ int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { struct msft_data *msft = hdev->msft_data; if (!msft) return -EOPNOTSUPP; if (msft->resuming || msft->suspending) return -EBUSY; return msft_remove_monitor_sync(hdev, monitor); } int msft_set_filter_enable(struct hci_dev *hdev, bool enable) { struct msft_cp_le_set_advertisement_filter_enable cp; struct msft_data *msft = hdev->msft_data; int err; if (!msft) return -EOPNOTSUPP; cp.sub_opcode = MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE; cp.enable = enable; err = __hci_cmd_sync_status(hdev, hdev->msft_opcode, sizeof(cp), &cp, HCI_CMD_TIMEOUT); msft_le_set_advertisement_filter_enable_cb(hdev, &cp, err); return 0; } bool msft_curve_validity(struct hci_dev *hdev) { return hdev->msft_curve_validity; } |
8 8 3 5 8 1 8 4 4 3 3 3 3 2 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" struct linkinfo_req_info { struct ethnl_req_info base; }; struct linkinfo_reply_data { struct ethnl_reply_data base; struct ethtool_link_ksettings ksettings; struct ethtool_link_settings *lsettings; }; #define LINKINFO_REPDATA(__reply_base) \ container_of(__reply_base, struct linkinfo_reply_data, base) const struct nla_policy ethnl_linkinfo_get_policy[] = { [ETHTOOL_A_LINKINFO_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int linkinfo_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; data->lsettings = &data->ksettings.base; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = __ethtool_get_link_ksettings(dev, &data->ksettings); if (ret < 0) GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); ethnl_ops_complete(dev); return ret; } static int linkinfo_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { return nla_total_size(sizeof(u8)) /* LINKINFO_PORT */ + nla_total_size(sizeof(u8)) /* LINKINFO_PHYADDR */ + nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX */ + nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX_CTRL */ + nla_total_size(sizeof(u8)) /* LINKINFO_TRANSCEIVER */ + 0; } static int linkinfo_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base); if (nla_put_u8(skb, ETHTOOL_A_LINKINFO_PORT, data->lsettings->port) || nla_put_u8(skb, ETHTOOL_A_LINKINFO_PHYADDR, data->lsettings->phy_address) || nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX, data->lsettings->eth_tp_mdix) || nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX_CTRL, data->lsettings->eth_tp_mdix_ctrl) || nla_put_u8(skb, ETHTOOL_A_LINKINFO_TRANSCEIVER, data->lsettings->transceiver)) return -EMSGSIZE; return 0; } /* LINKINFO_SET */ const struct nla_policy ethnl_linkinfo_set_policy[] = { [ETHTOOL_A_LINKINFO_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_LINKINFO_PORT] = { .type = NLA_U8 }, [ETHTOOL_A_LINKINFO_PHYADDR] = { .type = NLA_U8 }, [ETHTOOL_A_LINKINFO_TP_MDIX_CTRL] = { .type = NLA_U8 }, }; static int ethnl_set_linkinfo_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; if (!ops->get_link_ksettings || !ops->set_link_ksettings) return -EOPNOTSUPP; return 1; } static int ethnl_set_linkinfo(struct ethnl_req_info *req_info, struct genl_info *info) { struct ethtool_link_ksettings ksettings = {}; struct ethtool_link_settings *lsettings; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; bool mod = false; int ret; ret = __ethtool_get_link_ksettings(dev, &ksettings); if (ret < 0) { GENL_SET_ERR_MSG(info, "failed to retrieve link settings"); return ret; } lsettings = &ksettings.base; ethnl_update_u8(&lsettings->port, tb[ETHTOOL_A_LINKINFO_PORT], &mod); ethnl_update_u8(&lsettings->phy_address, tb[ETHTOOL_A_LINKINFO_PHYADDR], &mod); ethnl_update_u8(&lsettings->eth_tp_mdix_ctrl, tb[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL], &mod); if (!mod) return 0; ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings); if (ret < 0) { GENL_SET_ERR_MSG(info, "link settings update failed"); return ret; } return 1; } const struct ethnl_request_ops ethnl_linkinfo_request_ops = { .request_cmd = ETHTOOL_MSG_LINKINFO_GET, .reply_cmd = ETHTOOL_MSG_LINKINFO_GET_REPLY, .hdr_attr = ETHTOOL_A_LINKINFO_HEADER, .req_info_size = sizeof(struct linkinfo_req_info), .reply_data_size = sizeof(struct linkinfo_reply_data), .prepare_data = linkinfo_prepare_data, .reply_size = linkinfo_reply_size, .fill_reply = linkinfo_fill_reply, .set_validate = ethnl_set_linkinfo_validate, .set = ethnl_set_linkinfo, .set_ntf_cmd = ETHTOOL_MSG_LINKINFO_NTF, }; |
134 3 3 3 13 13 13 21 1 1 1 17 1 1 1 1 1 1 1 9 4 13 13 1 6 6 4 2 6 6 6 846 848 845 177 128 9 9 13 13 13 4 9 13 13 13 13 3 6 2 2 3 6 2 2 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_mirred.c packet mirroring and redirect actions * * Authors: Jamal Hadi Salim (2002-4) * * TODO: Add ingress support (and socket redirect support) */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <linux/gfp.h> #include <linux/if_arp.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/dst.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <linux/tc_act/tc_mirred.h> #include <net/tc_act/tc_mirred.h> #include <net/tc_wrapper.h> static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); #define MIRRED_NEST_LIMIT 4 static DEFINE_PER_CPU(unsigned int, mirred_nest_level); static bool tcf_mirred_is_act_redirect(int action) { return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR; } static bool tcf_mirred_act_wants_ingress(int action) { switch (action) { case TCA_EGRESS_REDIR: case TCA_EGRESS_MIRROR: return false; case TCA_INGRESS_REDIR: case TCA_INGRESS_MIRROR: return true; default: BUG(); } } static bool tcf_mirred_can_reinsert(int action) { switch (action) { case TC_ACT_SHOT: case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: return true; } return false; } static struct net_device *tcf_mirred_dev_dereference(struct tcf_mirred *m) { return rcu_dereference_protected(m->tcfm_dev, lockdep_is_held(&m->tcf_lock)); } static void tcf_mirred_release(struct tc_action *a) { struct tcf_mirred *m = to_mirred(a); struct net_device *dev; spin_lock(&mirred_list_lock); list_del(&m->tcfm_list); spin_unlock(&mirred_list_lock); /* last reference to action, no need to lock */ dev = rcu_dereference_protected(m->tcfm_dev, 1); netdev_put(dev, &m->tcfm_dev_tracker); } static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, [TCA_MIRRED_BLOCKID] = NLA_POLICY_MIN(NLA_U32, 1), }; static struct tc_action_ops act_mirred_ops; static void tcf_mirred_replace_dev(struct tcf_mirred *m, struct net_device *ndev) { struct net_device *odev; odev = rcu_replace_pointer(m->tcfm_dev, ndev, lockdep_is_held(&m->tcf_lock)); netdev_put(odev, &m->tcfm_dev_tracker); } static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_mirred_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_MIRRED_MAX + 1]; struct tcf_chain *goto_ch = NULL; bool mac_header_xmit = false; struct tc_mirred *parm; struct tcf_mirred *m; bool exists = false; int ret, err; u32 index; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed"); return -EINVAL; } ret = nla_parse_nested_deprecated(tb, TCA_MIRRED_MAX, nla, mirred_policy, extack); if (ret < 0) return ret; if (!tb[TCA_MIRRED_PARMS]) { NL_SET_ERR_MSG_MOD(extack, "Missing required mirred parameters"); return -EINVAL; } parm = nla_data(tb[TCA_MIRRED_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return ACT_P_BOUND; if (tb[TCA_MIRRED_BLOCKID] && parm->ifindex) { NL_SET_ERR_MSG_MOD(extack, "Cannot specify Block ID and dev simultaneously"); if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } switch (parm->eaction) { case TCA_EGRESS_MIRROR: case TCA_EGRESS_REDIR: case TCA_INGRESS_REDIR: case TCA_INGRESS_MIRROR: break; default: if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option"); return -EINVAL; } if (!exists) { if (!parm->ifindex && !tb[TCA_MIRRED_BLOCKID]) { tcf_idr_cleanup(tn, index); NL_SET_ERR_MSG_MOD(extack, "Must specify device or block"); return -EINVAL; } ret = tcf_idr_create_from_flags(tn, index, est, a, &act_mirred_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } m = to_mirred(*a); if (ret == ACT_P_CREATED) INIT_LIST_HEAD(&m->tcfm_list); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; spin_lock_bh(&m->tcf_lock); if (parm->ifindex) { struct net_device *ndev; ndev = dev_get_by_index(net, parm->ifindex); if (!ndev) { spin_unlock_bh(&m->tcf_lock); err = -ENODEV; goto put_chain; } mac_header_xmit = dev_is_mac_header_xmit(ndev); tcf_mirred_replace_dev(m, ndev); netdev_tracker_alloc(ndev, &m->tcfm_dev_tracker, GFP_ATOMIC); m->tcfm_mac_header_xmit = mac_header_xmit; m->tcfm_blockid = 0; } else if (tb[TCA_MIRRED_BLOCKID]) { tcf_mirred_replace_dev(m, NULL); m->tcfm_mac_header_xmit = false; m->tcfm_blockid = nla_get_u32(tb[TCA_MIRRED_BLOCKID]); } goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); m->tcfm_eaction = parm->eaction; spin_unlock_bh(&m->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (ret == ACT_P_CREATED) { spin_lock(&mirred_list_lock); list_add(&m->tcfm_list, &mirred_list); spin_unlock(&mirred_list_lock); } return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static int tcf_mirred_forward(bool at_ingress, bool want_ingress, struct sk_buff *skb) { int err; if (!want_ingress) err = tcf_dev_queue_xmit(skb, dev_queue_xmit); else if (!at_ingress) err = netif_rx(skb); else err = netif_receive_skb(skb); return err; } static int tcf_mirred_to_dev(struct sk_buff *skb, struct tcf_mirred *m, struct net_device *dev, const bool m_mac_header_xmit, int m_eaction, int retval) { struct sk_buff *skb_to_send = skb; bool want_ingress; bool is_redirect; bool expects_nh; bool at_ingress; bool dont_clone; int mac_len; bool at_nh; int err; is_redirect = tcf_mirred_is_act_redirect(m_eaction); if (unlikely(!(dev->flags & IFF_UP)) || !netif_carrier_ok(dev)) { net_notice_ratelimited("tc mirred to Houston: device %s is down\n", dev->name); goto err_cant_do; } /* we could easily avoid the clone only if called by ingress and clsact; * since we can't easily detect the clsact caller, skip clone only for * ingress - that covers the TC S/W datapath. */ at_ingress = skb_at_tc_ingress(skb); dont_clone = skb_at_tc_ingress(skb) && is_redirect && tcf_mirred_can_reinsert(retval); if (!dont_clone) { skb_to_send = skb_clone(skb, GFP_ATOMIC); if (!skb_to_send) goto err_cant_do; } want_ingress = tcf_mirred_act_wants_ingress(m_eaction); /* All mirred/redirected skbs should clear previous ct info */ nf_reset_ct(skb_to_send); if (want_ingress && !at_ingress) /* drop dst for egress -> ingress */ skb_dst_drop(skb_to_send); expects_nh = want_ingress || !m_mac_header_xmit; at_nh = skb->data == skb_network_header(skb); if (at_nh != expects_nh) { mac_len = at_ingress ? skb->mac_len : skb_network_offset(skb); if (expects_nh) { /* target device/action expect data at nh */ skb_pull_rcsum(skb_to_send, mac_len); } else { /* target device/action expect data at mac */ skb_push_rcsum(skb_to_send, mac_len); } } skb_to_send->skb_iif = skb->dev->ifindex; skb_to_send->dev = dev; if (is_redirect) { if (skb == skb_to_send) retval = TC_ACT_CONSUMED; skb_set_redirected(skb_to_send, skb_to_send->tc_at_ingress); err = tcf_mirred_forward(at_ingress, want_ingress, skb_to_send); } else { err = tcf_mirred_forward(at_ingress, want_ingress, skb_to_send); } if (err) tcf_action_inc_overlimit_qstats(&m->common); return retval; err_cant_do: if (is_redirect) retval = TC_ACT_SHOT; tcf_action_inc_overlimit_qstats(&m->common); return retval; } static int tcf_blockcast_redir(struct sk_buff *skb, struct tcf_mirred *m, struct tcf_block *block, int m_eaction, const u32 exception_ifindex, int retval) { struct net_device *dev_prev = NULL; struct net_device *dev = NULL; unsigned long index; int mirred_eaction; mirred_eaction = tcf_mirred_act_wants_ingress(m_eaction) ? TCA_INGRESS_MIRROR : TCA_EGRESS_MIRROR; xa_for_each(&block->ports, index, dev) { if (index == exception_ifindex) continue; if (!dev_prev) goto assign_prev; tcf_mirred_to_dev(skb, m, dev_prev, dev_is_mac_header_xmit(dev), mirred_eaction, retval); assign_prev: dev_prev = dev; } if (dev_prev) return tcf_mirred_to_dev(skb, m, dev_prev, dev_is_mac_header_xmit(dev_prev), m_eaction, retval); return retval; } static int tcf_blockcast_mirror(struct sk_buff *skb, struct tcf_mirred *m, struct tcf_block *block, int m_eaction, const u32 exception_ifindex, int retval) { struct net_device *dev = NULL; unsigned long index; xa_for_each(&block->ports, index, dev) { if (index == exception_ifindex) continue; tcf_mirred_to_dev(skb, m, dev, dev_is_mac_header_xmit(dev), m_eaction, retval); } return retval; } static int tcf_blockcast(struct sk_buff *skb, struct tcf_mirred *m, const u32 blockid, struct tcf_result *res, int retval) { const u32 exception_ifindex = skb->dev->ifindex; struct tcf_block *block; bool is_redirect; int m_eaction; m_eaction = READ_ONCE(m->tcfm_eaction); is_redirect = tcf_mirred_is_act_redirect(m_eaction); /* we are already under rcu protection, so can call block lookup * directly. */ block = tcf_block_lookup(dev_net(skb->dev), blockid); if (!block || xa_empty(&block->ports)) { tcf_action_inc_overlimit_qstats(&m->common); return retval; } if (is_redirect) return tcf_blockcast_redir(skb, m, block, m_eaction, exception_ifindex, retval); /* If it's not redirect, it is mirror */ return tcf_blockcast_mirror(skb, m, block, m_eaction, exception_ifindex, retval); } TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_mirred *m = to_mirred(a); int retval = READ_ONCE(m->tcf_action); unsigned int nest_level; bool m_mac_header_xmit; struct net_device *dev; int m_eaction; u32 blockid; nest_level = __this_cpu_inc_return(mirred_nest_level); if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); retval = TC_ACT_SHOT; goto dec_nest_level; } tcf_lastuse_update(&m->tcf_tm); tcf_action_update_bstats(&m->common, skb); blockid = READ_ONCE(m->tcfm_blockid); if (blockid) { retval = tcf_blockcast(skb, m, blockid, res, retval); goto dec_nest_level; } dev = rcu_dereference_bh(m->tcfm_dev); if (unlikely(!dev)) { pr_notice_once("tc mirred: target device is gone\n"); tcf_action_inc_overlimit_qstats(&m->common); goto dec_nest_level; } m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); m_eaction = READ_ONCE(m->tcfm_eaction); retval = tcf_mirred_to_dev(skb, m, dev, m_mac_header_xmit, m_eaction, retval); dec_nest_level: __this_cpu_dec(mirred_nest_level); return retval; } static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_mirred *m = to_mirred(a); struct tcf_t *tm = &m->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_mirred *m = to_mirred(a); struct tc_mirred opt = { .index = m->tcf_index, .refcnt = refcount_read(&m->tcf_refcnt) - ref, .bindcnt = atomic_read(&m->tcf_bindcnt) - bind, }; struct net_device *dev; struct tcf_t t; u32 blockid; spin_lock_bh(&m->tcf_lock); opt.action = m->tcf_action; opt.eaction = m->tcfm_eaction; dev = tcf_mirred_dev_dereference(m); if (dev) opt.ifindex = dev->ifindex; if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt)) goto nla_put_failure; blockid = m->tcfm_blockid; if (blockid && nla_put_u32(skb, TCA_MIRRED_BLOCKID, blockid)) goto nla_put_failure; tcf_tm_dump(&t, &m->tcf_tm); if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD)) goto nla_put_failure; spin_unlock_bh(&m->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&m->tcf_lock); nlmsg_trim(skb, b); return -1; } static int mirred_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tcf_mirred *m; ASSERT_RTNL(); if (event == NETDEV_UNREGISTER) { spin_lock(&mirred_list_lock); list_for_each_entry(m, &mirred_list, tcfm_list) { spin_lock_bh(&m->tcf_lock); if (tcf_mirred_dev_dereference(m) == dev) { netdev_put(dev, &m->tcfm_dev_tracker); /* Note : no rcu grace period necessary, as * net_device are already rcu protected. */ RCU_INIT_POINTER(m->tcfm_dev, NULL); } spin_unlock_bh(&m->tcf_lock); } spin_unlock(&mirred_list_lock); } return NOTIFY_DONE; } static struct notifier_block mirred_device_notifier = { .notifier_call = mirred_device_event, }; static void tcf_mirred_dev_put(void *priv) { struct net_device *dev = priv; dev_put(dev); } static struct net_device * tcf_mirred_get_dev(const struct tc_action *a, tc_action_priv_destructor *destructor) { struct tcf_mirred *m = to_mirred(a); struct net_device *dev; rcu_read_lock(); dev = rcu_dereference(m->tcfm_dev); if (dev) { dev_hold(dev); *destructor = tcf_mirred_dev_put; } rcu_read_unlock(); return dev; } static size_t tcf_mirred_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_mirred)); } static void tcf_offload_mirred_get_dev(struct flow_action_entry *entry, const struct tc_action *act) { entry->dev = act->ops->get_dev(act, &entry->destructor); if (!entry->dev) return; entry->destructor_priv = entry->dev; } static int tcf_mirred_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; if (is_tcf_mirred_egress_redirect(act)) { entry->id = FLOW_ACTION_REDIRECT; tcf_offload_mirred_get_dev(entry, act); } else if (is_tcf_mirred_egress_mirror(act)) { entry->id = FLOW_ACTION_MIRRED; tcf_offload_mirred_get_dev(entry, act); } else if (is_tcf_mirred_ingress_redirect(act)) { entry->id = FLOW_ACTION_REDIRECT_INGRESS; tcf_offload_mirred_get_dev(entry, act); } else if (is_tcf_mirred_ingress_mirror(act)) { entry->id = FLOW_ACTION_MIRRED_INGRESS; tcf_offload_mirred_get_dev(entry, act); } else { NL_SET_ERR_MSG_MOD(extack, "Unsupported mirred offload"); return -EOPNOTSUPP; } *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; if (is_tcf_mirred_egress_redirect(act)) fl_action->id = FLOW_ACTION_REDIRECT; else if (is_tcf_mirred_egress_mirror(act)) fl_action->id = FLOW_ACTION_MIRRED; else if (is_tcf_mirred_ingress_redirect(act)) fl_action->id = FLOW_ACTION_REDIRECT_INGRESS; else if (is_tcf_mirred_ingress_mirror(act)) fl_action->id = FLOW_ACTION_MIRRED_INGRESS; else return -EOPNOTSUPP; } return 0; } static struct tc_action_ops act_mirred_ops = { .kind = "mirred", .id = TCA_ID_MIRRED, .owner = THIS_MODULE, .act = tcf_mirred_act, .stats_update = tcf_stats_update, .dump = tcf_mirred_dump, .cleanup = tcf_mirred_release, .init = tcf_mirred_init, .get_fill_size = tcf_mirred_get_fill_size, .offload_act_setup = tcf_mirred_offload_act_setup, .size = sizeof(struct tcf_mirred), .get_dev = tcf_mirred_get_dev, }; MODULE_ALIAS_NET_ACT("mirred"); static __net_init int mirred_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_mirred_ops.net_id); return tc_action_net_init(net, tn, &act_mirred_ops); } static void __net_exit mirred_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_mirred_ops.net_id); } static struct pernet_operations mirred_net_ops = { .init = mirred_init_net, .exit_batch = mirred_exit_net, .id = &act_mirred_ops.net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002)"); MODULE_DESCRIPTION("Device Mirror/redirect actions"); MODULE_LICENSE("GPL"); static int __init mirred_init_module(void) { int err = register_netdevice_notifier(&mirred_device_notifier); if (err) return err; pr_info("Mirror/redirect action on\n"); err = tcf_register_action(&act_mirred_ops, &mirred_net_ops); if (err) unregister_netdevice_notifier(&mirred_device_notifier); return err; } static void __exit mirred_cleanup_module(void) { tcf_unregister_action(&act_mirred_ops, &mirred_net_ops); unregister_netdevice_notifier(&mirred_device_notifier); } module_init(mirred_init_module); module_exit(mirred_cleanup_module); |
2 1 1 5 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 | // SPDX-License-Identifier: GPL-2.0 #include <crypto/internal/hash.h> #include <linux/init.h> #include <linux/module.h> #include <linux/xxhash.h> #include <linux/unaligned.h> #define XXHASH64_BLOCK_SIZE 32 #define XXHASH64_DIGEST_SIZE 8 struct xxhash64_tfm_ctx { u64 seed; }; struct xxhash64_desc_ctx { struct xxh64_state xxhstate; }; static int xxhash64_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct xxhash64_tfm_ctx *tctx = crypto_shash_ctx(tfm); if (keylen != sizeof(tctx->seed)) return -EINVAL; tctx->seed = get_unaligned_le64(key); return 0; } static int xxhash64_init(struct shash_desc *desc) { struct xxhash64_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct xxhash64_desc_ctx *dctx = shash_desc_ctx(desc); xxh64_reset(&dctx->xxhstate, tctx->seed); return 0; } static int xxhash64_update(struct shash_desc *desc, const u8 *data, unsigned int length) { struct xxhash64_desc_ctx *dctx = shash_desc_ctx(desc); xxh64_update(&dctx->xxhstate, data, length); return 0; } static int xxhash64_final(struct shash_desc *desc, u8 *out) { struct xxhash64_desc_ctx *dctx = shash_desc_ctx(desc); put_unaligned_le64(xxh64_digest(&dctx->xxhstate), out); return 0; } static int xxhash64_digest(struct shash_desc *desc, const u8 *data, unsigned int length, u8 *out) { struct xxhash64_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); put_unaligned_le64(xxh64(data, length, tctx->seed), out); return 0; } static struct shash_alg alg = { .digestsize = XXHASH64_DIGEST_SIZE, .setkey = xxhash64_setkey, .init = xxhash64_init, .update = xxhash64_update, .final = xxhash64_final, .digest = xxhash64_digest, .descsize = sizeof(struct xxhash64_desc_ctx), .base = { .cra_name = "xxhash64", .cra_driver_name = "xxhash64-generic", .cra_priority = 100, .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, .cra_blocksize = XXHASH64_BLOCK_SIZE, .cra_ctxsize = sizeof(struct xxhash64_tfm_ctx), .cra_module = THIS_MODULE, } }; static int __init xxhash_mod_init(void) { return crypto_register_shash(&alg); } static void __exit xxhash_mod_fini(void) { crypto_unregister_shash(&alg); } subsys_initcall(xxhash_mod_init); module_exit(xxhash_mod_fini); MODULE_AUTHOR("Nikolay Borisov <nborisov@suse.com>"); MODULE_DESCRIPTION("xxhash calculations wrapper for lib/xxhash.c"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("xxhash64"); MODULE_ALIAS_CRYPTO("xxhash64-generic"); |
1 1 1 1 1 634 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 | // SPDX-License-Identifier: GPL-2.0 /* * Dynamic byte queue limits. See include/linux/dynamic_queue_limits.h * * Copyright (c) 2011, Tom Herbert <therbert@google.com> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/dynamic_queue_limits.h> #include <linux/compiler.h> #include <linux/export.h> #include <trace/events/napi.h> #define POSDIFF(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0) #define AFTER_EQ(A, B) ((int)((A) - (B)) >= 0) static void dql_check_stall(struct dql *dql, unsigned short stall_thrs) { unsigned long now; if (!stall_thrs) return; now = jiffies; /* Check for a potential stall */ if (time_after_eq(now, dql->last_reap + stall_thrs)) { unsigned long hist_head, t, start, end; /* We are trying to detect a period of at least @stall_thrs * jiffies without any Tx completions, but during first half * of which some Tx was posted. */ dqs_again: hist_head = READ_ONCE(dql->history_head); /* pairs with smp_wmb() in dql_queued() */ smp_rmb(); /* Get the previous entry in the ring buffer, which is the * oldest sample. */ start = (hist_head - DQL_HIST_LEN + 1) * BITS_PER_LONG; /* Advance start to continue from the last reap time */ if (time_before(start, dql->last_reap + 1)) start = dql->last_reap + 1; /* Newest sample we should have already seen a completion for */ end = hist_head * BITS_PER_LONG + (BITS_PER_LONG - 1); /* Shrink the search space to [start, (now - start_thrs/2)] if * `end` is beyond the stall zone */ if (time_before(now, end + stall_thrs / 2)) end = now - stall_thrs / 2; /* Search for the queued time in [t, end] */ for (t = start; time_before_eq(t, end); t++) if (test_bit(t % (DQL_HIST_LEN * BITS_PER_LONG), dql->history)) break; /* Variable t contains the time of the queue */ if (!time_before_eq(t, end)) goto no_stall; /* The ring buffer was modified in the meantime, retry */ if (hist_head != READ_ONCE(dql->history_head)) goto dqs_again; dql->stall_cnt++; dql->stall_max = max_t(unsigned short, dql->stall_max, now - t); trace_dql_stall_detected(dql->stall_thrs, now - t, dql->last_reap, dql->history_head, now, dql->history); } no_stall: dql->last_reap = now; } /* Records completed count and recalculates the queue limit */ void dql_completed(struct dql *dql, unsigned int count) { unsigned int inprogress, prev_inprogress, limit; unsigned int ovlimit, completed, num_queued; unsigned short stall_thrs; bool all_prev_completed; num_queued = READ_ONCE(dql->num_queued); /* Read stall_thrs in advance since it belongs to the same (first) * cache line as ->num_queued. This way, dql_check_stall() does not * need to touch the first cache line again later, reducing the window * of possible false sharing. */ stall_thrs = READ_ONCE(dql->stall_thrs); /* Can't complete more than what's in queue */ BUG_ON(count > num_queued - dql->num_completed); completed = dql->num_completed + count; limit = dql->limit; ovlimit = POSDIFF(num_queued - dql->num_completed, limit); inprogress = num_queued - completed; prev_inprogress = dql->prev_num_queued - dql->num_completed; all_prev_completed = AFTER_EQ(completed, dql->prev_num_queued); if ((ovlimit && !inprogress) || (dql->prev_ovlimit && all_prev_completed)) { /* * Queue considered starved if: * - The queue was over-limit in the last interval, * and there is no more data in the queue. * OR * - The queue was over-limit in the previous interval and * when enqueuing it was possible that all queued data * had been consumed. This covers the case when queue * may have becomes starved between completion processing * running and next time enqueue was scheduled. * * When queue is starved increase the limit by the amount * of bytes both sent and completed in the last interval, * plus any previous over-limit. */ limit += POSDIFF(completed, dql->prev_num_queued) + dql->prev_ovlimit; dql->slack_start_time = jiffies; dql->lowest_slack = UINT_MAX; } else if (inprogress && prev_inprogress && !all_prev_completed) { /* * Queue was not starved, check if the limit can be decreased. * A decrease is only considered if the queue has been busy in * the whole interval (the check above). * * If there is slack, the amount of excess data queued above * the amount needed to prevent starvation, the queue limit * can be decreased. To avoid hysteresis we consider the * minimum amount of slack found over several iterations of the * completion routine. */ unsigned int slack, slack_last_objs; /* * Slack is the maximum of * - The queue limit plus previous over-limit minus twice * the number of objects completed. Note that two times * number of completed bytes is a basis for an upper bound * of the limit. * - Portion of objects in the last queuing operation that * was not part of non-zero previous over-limit. That is * "round down" by non-overlimit portion of the last * queueing operation. */ slack = POSDIFF(limit + dql->prev_ovlimit, 2 * (completed - dql->num_completed)); slack_last_objs = dql->prev_ovlimit ? POSDIFF(dql->prev_last_obj_cnt, dql->prev_ovlimit) : 0; slack = max(slack, slack_last_objs); if (slack < dql->lowest_slack) dql->lowest_slack = slack; if (time_after(jiffies, dql->slack_start_time + dql->slack_hold_time)) { limit = POSDIFF(limit, dql->lowest_slack); dql->slack_start_time = jiffies; dql->lowest_slack = UINT_MAX; } } /* Enforce bounds on limit */ limit = clamp(limit, dql->min_limit, dql->max_limit); if (limit != dql->limit) { dql->limit = limit; ovlimit = 0; } dql->adj_limit = limit + completed; dql->prev_ovlimit = ovlimit; dql->prev_last_obj_cnt = dql->last_obj_cnt; dql->num_completed = completed; dql->prev_num_queued = num_queued; dql_check_stall(dql, stall_thrs); } EXPORT_SYMBOL(dql_completed); void dql_reset(struct dql *dql) { /* Reset all dynamic values */ dql->limit = 0; dql->num_queued = 0; dql->num_completed = 0; dql->last_obj_cnt = 0; dql->prev_num_queued = 0; dql->prev_last_obj_cnt = 0; dql->prev_ovlimit = 0; dql->lowest_slack = UINT_MAX; dql->slack_start_time = jiffies; dql->last_reap = jiffies; dql->history_head = jiffies / BITS_PER_LONG; memset(dql->history, 0, sizeof(dql->history)); } EXPORT_SYMBOL(dql_reset); void dql_init(struct dql *dql, unsigned int hold_time) { dql->max_limit = DQL_MAX_LIMIT; dql->min_limit = 0; dql->slack_hold_time = hold_time; dql->stall_thrs = 0; dql_reset(dql); } EXPORT_SYMBOL(dql_init); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Routines to manage notifier chains for passing status changes to any * interested routines. We need this instead of hard coded call lists so * that modules can poke their nose into the innards. The network devices * needed them so here they are for the rest of you. * * Alan Cox <Alan.Cox@linux.org> */ #ifndef _LINUX_NOTIFIER_H #define _LINUX_NOTIFIER_H #include <linux/errno.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/srcu.h> /* * Notifier chains are of four types: * * Atomic notifier chains: Chain callbacks run in interrupt/atomic * context. Callouts are not allowed to block. * Blocking notifier chains: Chain callbacks run in process context. * Callouts are allowed to block. * Raw notifier chains: There are no restrictions on callbacks, * registration, or unregistration. All locking and protection * must be provided by the caller. * SRCU notifier chains: A variant of blocking notifier chains, with * the same restrictions. * * atomic_notifier_chain_register() may be called from an atomic context, * but blocking_notifier_chain_register() and srcu_notifier_chain_register() * must be called from a process context. Ditto for the corresponding * _unregister() routines. * * atomic_notifier_chain_unregister(), blocking_notifier_chain_unregister(), * and srcu_notifier_chain_unregister() _must not_ be called from within * the call chain. * * SRCU notifier chains are an alternative form of blocking notifier chains. * They use SRCU (Sleepable Read-Copy Update) instead of rw-semaphores for * protection of the chain links. This means there is _very_ low overhead * in srcu_notifier_call_chain(): no cache bounces and no memory barriers. * As compensation, srcu_notifier_chain_unregister() is rather expensive. * SRCU notifier chains should be used when the chain will be called very * often but notifier_blocks will seldom be removed. */ struct notifier_block; typedef int (*notifier_fn_t)(struct notifier_block *nb, unsigned long action, void *data); struct notifier_block { notifier_fn_t notifier_call; struct notifier_block __rcu *next; int priority; }; struct atomic_notifier_head { spinlock_t lock; struct notifier_block __rcu *head; }; struct blocking_notifier_head { struct rw_semaphore rwsem; struct notifier_block __rcu *head; }; struct raw_notifier_head { struct notifier_block __rcu *head; }; struct srcu_notifier_head { struct mutex mutex; struct srcu_usage srcuu; struct srcu_struct srcu; struct notifier_block __rcu *head; }; #define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \ spin_lock_init(&(name)->lock); \ (name)->head = NULL; \ } while (0) #define BLOCKING_INIT_NOTIFIER_HEAD(name) do { \ init_rwsem(&(name)->rwsem); \ (name)->head = NULL; \ } while (0) #define RAW_INIT_NOTIFIER_HEAD(name) do { \ (name)->head = NULL; \ } while (0) /* srcu_notifier_heads must be cleaned up dynamically */ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); #define srcu_cleanup_notifier_head(name) \ cleanup_srcu_struct(&(name)->srcu); #define ATOMIC_NOTIFIER_INIT(name) { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .head = NULL } #define BLOCKING_NOTIFIER_INIT(name) { \ .rwsem = __RWSEM_INITIALIZER((name).rwsem), \ .head = NULL } #define RAW_NOTIFIER_INIT(name) { \ .head = NULL } #define SRCU_NOTIFIER_INIT(name, pcpu) \ { \ .mutex = __MUTEX_INITIALIZER(name.mutex), \ .head = NULL, \ .srcuu = __SRCU_USAGE_INIT(name.srcuu), \ .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \ } #define ATOMIC_NOTIFIER_HEAD(name) \ struct atomic_notifier_head name = \ ATOMIC_NOTIFIER_INIT(name) #define BLOCKING_NOTIFIER_HEAD(name) \ struct blocking_notifier_head name = \ BLOCKING_NOTIFIER_INIT(name) #define RAW_NOTIFIER_HEAD(name) \ struct raw_notifier_head name = \ RAW_NOTIFIER_INIT(name) #ifdef CONFIG_TREE_SRCU #define _SRCU_NOTIFIER_HEAD(name, mod) \ static DEFINE_PER_CPU(struct srcu_data, name##_head_srcu_data); \ mod struct srcu_notifier_head name = \ SRCU_NOTIFIER_INIT(name, name##_head_srcu_data) #else #define _SRCU_NOTIFIER_HEAD(name, mod) \ mod struct srcu_notifier_head name = \ SRCU_NOTIFIER_INIT(name, name) #endif #define SRCU_NOTIFIER_HEAD(name) \ _SRCU_NOTIFIER_HEAD(name, /* not static */) #define SRCU_NOTIFIER_HEAD_STATIC(name) \ _SRCU_NOTIFIER_HEAD(name, static) #ifdef __KERNEL__ extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh, struct notifier_block *nb); extern int blocking_notifier_chain_register(struct blocking_notifier_head *nh, struct notifier_block *nb); extern int raw_notifier_chain_register(struct raw_notifier_head *nh, struct notifier_block *nb); extern int srcu_notifier_chain_register(struct srcu_notifier_head *nh, struct notifier_block *nb); extern int atomic_notifier_chain_register_unique_prio( struct atomic_notifier_head *nh, struct notifier_block *nb); extern int blocking_notifier_chain_register_unique_prio( struct blocking_notifier_head *nh, struct notifier_block *nb); extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, struct notifier_block *nb); extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, struct notifier_block *nb); extern int raw_notifier_chain_unregister(struct raw_notifier_head *nh, struct notifier_block *nb); extern int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, struct notifier_block *nb); extern int atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v); extern int blocking_notifier_call_chain(struct blocking_notifier_head *nh, unsigned long val, void *v); extern int raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v); extern int srcu_notifier_call_chain(struct srcu_notifier_head *nh, unsigned long val, void *v); extern int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v); extern int raw_notifier_call_chain_robust(struct raw_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v); extern bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh); #define NOTIFY_DONE 0x0000 /* Don't care */ #define NOTIFY_OK 0x0001 /* Suits me */ #define NOTIFY_STOP_MASK 0x8000 /* Don't call further */ #define NOTIFY_BAD (NOTIFY_STOP_MASK|0x0002) /* Bad/Veto action */ /* * Clean way to return from the notifier and stop further calls. */ #define NOTIFY_STOP (NOTIFY_OK|NOTIFY_STOP_MASK) /* Encapsulate (negative) errno value (in particular, NOTIFY_BAD <=> EPERM). */ static inline int notifier_from_errno(int err) { if (err) return NOTIFY_STOP_MASK | (NOTIFY_OK - err); return NOTIFY_OK; } /* Restore (negative) errno value from notify return value. */ static inline int notifier_to_errno(int ret) { ret &= ~NOTIFY_STOP_MASK; return ret > NOTIFY_OK ? NOTIFY_OK - ret : 0; } /* * Declared notifiers so far. I can imagine quite a few more chains * over time (eg laptop power reset chains, reboot chain (to clean * device units up), device [un]mount chain, module load/unload chain, * low memory chain, screenblank chain (for plug in modular screenblankers) * VC switch chains (for loadable kernel svgalib VC switch helpers) etc... */ /* CPU notfiers are defined in include/linux/cpu.h. */ /* netdevice notifiers are defined in include/linux/netdevice.h */ /* reboot notifiers are defined in include/linux/reboot.h. */ /* Hibernation and suspend events are defined in include/linux/suspend.h. */ /* Virtual Terminal events are defined in include/linux/vt.h. */ #define NETLINK_URELEASE 0x0001 /* Unicast netlink socket released */ /* Console keyboard events. * Note: KBD_KEYCODE is always sent before KBD_UNBOUND_KEYCODE, KBD_UNICODE and * KBD_KEYSYM. */ #define KBD_KEYCODE 0x0001 /* Keyboard keycode, called before any other */ #define KBD_UNBOUND_KEYCODE 0x0002 /* Keyboard keycode which is not bound to any other */ #define KBD_UNICODE 0x0003 /* Keyboard unicode */ #define KBD_KEYSYM 0x0004 /* Keyboard keysym */ #define KBD_POST_KEYSYM 0x0005 /* Called after keyboard keysym interpretation */ extern struct blocking_notifier_head reboot_notifier_list; #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */ |
845 845 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 | /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright 2011-2014 Autronica Fire and Security AS * * 2011-2014 Arvid Brodin, arvid.brodin@alten.se * * include file for HSR and PRP. */ #ifndef __HSR_SLAVE_H #define __HSR_SLAVE_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include "hsr_main.h" int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, enum hsr_port_type pt, struct netlink_ext_ack *extack); void hsr_del_port(struct hsr_port *port); bool hsr_port_exists(const struct net_device *dev); static inline struct hsr_port *hsr_port_get_rtnl(const struct net_device *dev) { ASSERT_RTNL(); return hsr_port_exists(dev) ? rtnl_dereference(dev->rx_handler_data) : NULL; } static inline struct hsr_port *hsr_port_get_rcu(const struct net_device *dev) { return hsr_port_exists(dev) ? rcu_dereference(dev->rx_handler_data) : NULL; } bool hsr_invalid_dan_ingress_frame(__be16 protocol); #endif /* __HSR_SLAVE_H */ |
6 6 6 5 3 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | // SPDX-License-Identifier: GPL-2.0-or-later /* Key permission checking * * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/export.h> #include <linux/security.h> #include "internal.h" /** * key_task_permission - Check a key can be used * @key_ref: The key to check. * @cred: The credentials to use. * @need_perm: The permission required. * * Check to see whether permission is granted to use a key in the desired way, * but permit the security modules to override. * * The caller must hold either a ref on cred or must hold the RCU readlock. * * Returns 0 if successful, -EACCES if access is denied based on the * permissions bits or the LSM check. */ int key_task_permission(const key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) { struct key *key; key_perm_t kperm, mask; int ret; switch (need_perm) { default: WARN_ON(1); return -EACCES; case KEY_NEED_UNLINK: case KEY_SYSADMIN_OVERRIDE: case KEY_AUTHTOKEN_OVERRIDE: case KEY_DEFER_PERM_CHECK: goto lsm; case KEY_NEED_VIEW: mask = KEY_OTH_VIEW; break; case KEY_NEED_READ: mask = KEY_OTH_READ; break; case KEY_NEED_WRITE: mask = KEY_OTH_WRITE; break; case KEY_NEED_SEARCH: mask = KEY_OTH_SEARCH; break; case KEY_NEED_LINK: mask = KEY_OTH_LINK; break; case KEY_NEED_SETATTR: mask = KEY_OTH_SETATTR; break; } key = key_ref_to_ptr(key_ref); /* use the second 8-bits of permissions for keys the caller owns */ if (uid_eq(key->uid, cred->fsuid)) { kperm = key->perm >> 16; goto use_these_perms; } /* use the third 8-bits of permissions for keys the caller has a group * membership in common with */ if (gid_valid(key->gid) && key->perm & KEY_GRP_ALL) { if (gid_eq(key->gid, cred->fsgid)) { kperm = key->perm >> 8; goto use_these_perms; } ret = groups_search(cred->group_info, key->gid); if (ret) { kperm = key->perm >> 8; goto use_these_perms; } } /* otherwise use the least-significant 8-bits */ kperm = key->perm; use_these_perms: /* use the top 8-bits of permissions for keys the caller possesses * - possessor permissions are additive with other permissions */ if (is_key_possessed(key_ref)) kperm |= key->perm >> 24; if ((kperm & mask) != mask) return -EACCES; /* let LSM be the final arbiter */ lsm: return security_key_permission(key_ref, cred, need_perm); } EXPORT_SYMBOL(key_task_permission); /** * key_validate - Validate a key. * @key: The key to be validated. * * Check that a key is valid, returning 0 if the key is okay, -ENOKEY if the * key is invalidated, -EKEYREVOKED if the key's type has been removed or if * the key has been revoked or -EKEYEXPIRED if the key has expired. */ int key_validate(const struct key *key) { unsigned long flags = READ_ONCE(key->flags); time64_t expiry = READ_ONCE(key->expiry); if (flags & (1 << KEY_FLAG_INVALIDATED)) return -ENOKEY; /* check it's still accessible */ if (flags & ((1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_DEAD))) return -EKEYREVOKED; /* check it hasn't expired */ if (expiry) { if (ktime_get_real_seconds() >= expiry) return -EKEYEXPIRED; } return 0; } EXPORT_SYMBOL(key_validate); |
2 2 2 9 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 | /* SPDX-License-Identifier: GPL-2.0 */ /* XDP user-space ring structure * Copyright(c) 2018 Intel Corporation. */ #ifndef _LINUX_XSK_QUEUE_H #define _LINUX_XSK_QUEUE_H #include <linux/types.h> #include <linux/if_xdp.h> #include <net/xdp_sock.h> #include <net/xsk_buff_pool.h> #include "xsk.h" struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; /* Hinder the adjacent cache prefetcher to prefetch the consumer * pointer if the producer pointer is touched and vice versa. */ u32 pad1 ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; u32 pad2 ____cacheline_aligned_in_smp; u32 flags; u32 pad3 ____cacheline_aligned_in_smp; }; /* Used for the RX and TX queues for packets */ struct xdp_rxtx_ring { struct xdp_ring ptrs; struct xdp_desc desc[] ____cacheline_aligned_in_smp; }; /* Used for the fill and completion queues for buffers */ struct xdp_umem_ring { struct xdp_ring ptrs; u64 desc[] ____cacheline_aligned_in_smp; }; struct xsk_queue { u32 ring_mask; u32 nentries; u32 cached_prod; u32 cached_cons; struct xdp_ring *ring; u64 invalid_descs; u64 queue_empty_descs; size_t ring_vmalloc_size; }; struct parsed_desc { u32 mb; u32 valid; }; /* The structure of the shared state of the rings are a simple * circular buffer, as outlined in * Documentation/core-api/circular-buffers.rst. For the Rx and * completion ring, the kernel is the producer and user space is the * consumer. For the Tx and fill rings, the kernel is the consumer and * user space is the producer. * * producer consumer * * if (LOAD ->consumer) { (A) LOAD.acq ->producer (C) * STORE $data LOAD $data * STORE.rel ->producer (B) STORE.rel ->consumer (D) * } * * (A) pairs with (D), and (B) pairs with (C). * * Starting with (B), it protects the data from being written after * the producer pointer. If this barrier was missing, the consumer * could observe the producer pointer being set and thus load the data * before the producer has written the new data. The consumer would in * this case load the old data. * * (C) protects the consumer from speculatively loading the data before * the producer pointer actually has been read. If we do not have this * barrier, some architectures could load old data as speculative loads * are not discarded as the CPU does not know there is a dependency * between ->producer and data. * * (A) is a control dependency that separates the load of ->consumer * from the stores of $data. In case ->consumer indicates there is no * room in the buffer to store $data we do not. The dependency will * order both of the stores after the loads. So no barrier is needed. * * (D) protects the load of the data to be observed to happen after the * store of the consumer pointer. If we did not have this memory * barrier, the producer could observe the consumer pointer being set * and overwrite the data with a new value before the consumer got the * chance to read the old value. The consumer would thus miss reading * the old entry and very likely read the new entry twice, once right * now and again after circling through the ring. */ /* The operations on the rings are the following: * * producer consumer * * RESERVE entries PEEK in the ring for entries * WRITE data into the ring READ data from the ring * SUBMIT entries RELEASE entries * * The producer reserves one or more entries in the ring. It can then * fill in these entries and finally submit them so that they can be * seen and read by the consumer. * * The consumer peeks into the ring to see if the producer has written * any new entries. If so, the consumer can then read these entries * and when it is done reading them release them back to the producer * so that the producer can use these slots to fill in new entries. * * The function names below reflect these operations. */ /* Functions that read and validate content from consumer rings. */ static inline void __xskq_cons_read_addr_unchecked(struct xsk_queue *q, u32 cached_cons, u64 *addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; u32 idx = cached_cons & q->ring_mask; *addr = ring->desc[idx]; } static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) { if (q->cached_cons != q->cached_prod) { __xskq_cons_read_addr_unchecked(q, q->cached_cons, addr); return true; } return false; } static inline bool xp_unused_options_set(u32 options) { return options & ~(XDP_PKT_CONTD | XDP_TX_METADATA); } static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { u64 addr = desc->addr - pool->tx_metadata_len; u64 len = desc->len + pool->tx_metadata_len; u64 offset = addr & (pool->chunk_size - 1); if (!desc->len) return false; if (offset + len > pool->chunk_size) return false; if (addr >= pool->addrs_cnt) return false; if (xp_unused_options_set(desc->options)) return false; return true; } static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len; u64 len = desc->len + pool->tx_metadata_len; if (!desc->len) return false; if (len > pool->chunk_size) return false; if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt || xp_desc_crosses_non_contig_pg(pool, addr, len)) return false; if (xp_unused_options_set(desc->options)) return false; return true; } static inline bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) : xp_aligned_validate_desc(pool, desc); } static inline bool xskq_has_descs(struct xsk_queue *q) { return q->cached_cons != q->cached_prod; } static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, struct xsk_buff_pool *pool) { if (!xp_validate_desc(pool, d)) { q->invalid_descs++; return false; } return true; } static inline bool xskq_cons_read_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xsk_buff_pool *pool) { if (q->cached_cons != q->cached_prod) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = q->cached_cons & q->ring_mask; *desc = ring->desc[idx]; return xskq_cons_is_valid_desc(q, desc, pool); } q->queue_empty_descs++; return false; } static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) { q->cached_cons += cnt; } static inline void parse_desc(struct xsk_queue *q, struct xsk_buff_pool *pool, struct xdp_desc *desc, struct parsed_desc *parsed) { parsed->valid = xskq_cons_is_valid_desc(q, desc, pool); parsed->mb = xp_mb_desc(desc); } static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, u32 max) { u32 cached_cons = q->cached_cons, nb_entries = 0; struct xdp_desc *descs = pool->tx_descs; u32 total_descs = 0, nr_frags = 0; /* track first entry, if stumble upon *any* invalid descriptor, rewind * current packet that consists of frags and stop the processing */ while (cached_cons != q->cached_prod && nb_entries < max) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = cached_cons & q->ring_mask; struct parsed_desc parsed; descs[nb_entries] = ring->desc[idx]; cached_cons++; parse_desc(q, pool, &descs[nb_entries], &parsed); if (unlikely(!parsed.valid)) break; if (likely(!parsed.mb)) { total_descs += (nr_frags + 1); nr_frags = 0; } else { nr_frags++; if (nr_frags == pool->netdev->xdp_zc_max_segs) { nr_frags = 0; break; } } nb_entries++; } cached_cons -= nr_frags; /* Release valid plus any invalid entries */ xskq_cons_release_n(q, cached_cons - q->cached_cons); return total_descs; } /* Functions for consumers */ static inline void __xskq_cons_release(struct xsk_queue *q) { smp_store_release(&q->ring->consumer, q->cached_cons); /* D, matchees A */ } static inline void __xskq_cons_peek(struct xsk_queue *q) { /* Refresh the local pointer */ q->cached_prod = smp_load_acquire(&q->ring->producer); /* C, matches B */ } static inline void xskq_cons_get_entries(struct xsk_queue *q) { __xskq_cons_release(q); __xskq_cons_peek(q); } static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max) { u32 entries = q->cached_prod - q->cached_cons; if (entries >= max) return max; __xskq_cons_peek(q); entries = q->cached_prod - q->cached_cons; return entries >= max ? max : entries; } static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) { if (q->cached_prod == q->cached_cons) xskq_cons_get_entries(q); return xskq_cons_read_addr_unchecked(q, addr); } static inline bool xskq_cons_peek_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xsk_buff_pool *pool) { if (q->cached_prod == q->cached_cons) xskq_cons_get_entries(q); return xskq_cons_read_desc(q, desc, pool); } /* To improve performance in the xskq_cons_release functions, only update local state here. * Reflect this to global state when we get new entries from the ring in * xskq_cons_get_entries() and whenever Rx or Tx processing are completed in the NAPI loop. */ static inline void xskq_cons_release(struct xsk_queue *q) { q->cached_cons++; } static inline void xskq_cons_cancel_n(struct xsk_queue *q, u32 cnt) { q->cached_cons -= cnt; } static inline u32 xskq_cons_present_entries(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ return READ_ONCE(q->ring->producer) - READ_ONCE(q->ring->consumer); } /* Functions for producers */ static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) { u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); if (free_entries >= max) return max; /* Refresh the local tail pointer */ q->cached_cons = READ_ONCE(q->ring->consumer); free_entries = q->nentries - (q->cached_prod - q->cached_cons); return free_entries >= max ? max : free_entries; } static inline bool xskq_prod_is_full(struct xsk_queue *q) { return xskq_prod_nb_free(q, 1) ? false : true; } static inline void xskq_prod_cancel_n(struct xsk_queue *q, u32 cnt) { q->cached_prod -= cnt; } static inline int xskq_prod_reserve(struct xsk_queue *q) { if (xskq_prod_is_full(q)) return -ENOSPC; /* A, matches D */ q->cached_prod++; return 0; } static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; if (xskq_prod_is_full(q)) return -ENOSPC; /* A, matches D */ ring->desc[q->cached_prod++ & q->ring_mask] = addr; return 0; } static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, u32 nb_entries) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; u32 i, cached_prod; /* A, matches D */ cached_prod = q->cached_prod; for (i = 0; i < nb_entries; i++) ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr; q->cached_prod = cached_prod; } static inline int xskq_prod_reserve_desc(struct xsk_queue *q, u64 addr, u32 len, u32 flags) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx; if (xskq_prod_is_full(q)) return -ENOBUFS; /* A, matches D */ idx = q->cached_prod++ & q->ring_mask; ring->desc[idx].addr = addr; ring->desc[idx].len = len; ring->desc[idx].options = flags; return 0; } static inline void __xskq_prod_submit(struct xsk_queue *q, u32 idx) { smp_store_release(&q->ring->producer, idx); /* B, matches C */ } static inline void xskq_prod_submit(struct xsk_queue *q) { __xskq_prod_submit(q, q->cached_prod); } static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) { __xskq_prod_submit(q, q->ring->producer + nb_entries); } static inline bool xskq_prod_is_empty(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ return READ_ONCE(q->ring->consumer) == READ_ONCE(q->ring->producer); } /* For both producers and consumers */ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) { return q ? q->invalid_descs : 0; } static inline u64 xskq_nb_queue_empty_descs(struct xsk_queue *q) { return q ? q->queue_empty_descs : 0; } struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); #endif /* _LINUX_XSK_QUEUE_H */ |
1776 1776 6 1772 71 71 1431 1430 1424 503 4196 158 158 49 152 127 103 103 149 150 648 69 2 68 2 454 2 445 98 159 17 654 648 261 22 21 235 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/bitmap.h> #include <linux/bug.h> #include <linux/export.h> #include <linux/idr.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/xarray.h> /** * idr_alloc_u32() - Allocate an ID. * @idr: IDR handle. * @ptr: Pointer to be associated with the new ID. * @nextid: Pointer to an ID. * @max: The maximum ID to allocate (inclusive). * @gfp: Memory allocation flags. * * Allocates an unused ID in the range specified by @nextid and @max. * Note that @max is inclusive whereas the @end parameter to idr_alloc() * is exclusive. The new ID is assigned to @nextid before the pointer * is inserted into the IDR, so if @nextid points into the object pointed * to by @ptr, a concurrent lookup will not find an uninitialised ID. * * The caller should provide their own locking to ensure that two * concurrent modifications to the IDR are not possible. Read-only * accesses to the IDR may be done under the RCU read lock or may * exclude simultaneous writers. * * Return: 0 if an ID was allocated, -ENOMEM if memory allocation failed, * or -ENOSPC if no free IDs could be found. If an error occurred, * @nextid is unchanged. */ int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid, unsigned long max, gfp_t gfp) { struct radix_tree_iter iter; void __rcu **slot; unsigned int base = idr->idr_base; unsigned int id = *nextid; if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR))) idr->idr_rt.xa_flags |= IDR_RT_MARKER; id = (id < base) ? 0 : id - base; radix_tree_iter_init(&iter, id); slot = idr_get_free(&idr->idr_rt, &iter, gfp, max - base); if (IS_ERR(slot)) return PTR_ERR(slot); *nextid = iter.index + base; /* there is a memory barrier inside radix_tree_iter_replace() */ radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr); radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE); return 0; } EXPORT_SYMBOL_GPL(idr_alloc_u32); /** * idr_alloc() - Allocate an ID. * @idr: IDR handle. * @ptr: Pointer to be associated with the new ID. * @start: The minimum ID (inclusive). * @end: The maximum ID (exclusive). * @gfp: Memory allocation flags. * * Allocates an unused ID in the range specified by @start and @end. If * @end is <= 0, it is treated as one larger than %INT_MAX. This allows * callers to use @start + N as @end as long as N is within integer range. * * The caller should provide their own locking to ensure that two * concurrent modifications to the IDR are not possible. Read-only * accesses to the IDR may be done under the RCU read lock or may * exclude simultaneous writers. * * Return: The newly allocated ID, -ENOMEM if memory allocation failed, * or -ENOSPC if no free IDs could be found. */ int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp) { u32 id = start; int ret; if (WARN_ON_ONCE(start < 0)) return -EINVAL; ret = idr_alloc_u32(idr, ptr, &id, end > 0 ? end - 1 : INT_MAX, gfp); if (ret) return ret; return id; } EXPORT_SYMBOL_GPL(idr_alloc); /** * idr_alloc_cyclic() - Allocate an ID cyclically. * @idr: IDR handle. * @ptr: Pointer to be associated with the new ID. * @start: The minimum ID (inclusive). * @end: The maximum ID (exclusive). * @gfp: Memory allocation flags. * * Allocates an unused ID in the range specified by @start and @end. If * @end is <= 0, it is treated as one larger than %INT_MAX. This allows * callers to use @start + N as @end as long as N is within integer range. * The search for an unused ID will start at the last ID allocated and will * wrap around to @start if no free IDs are found before reaching @end. * * The caller should provide their own locking to ensure that two * concurrent modifications to the IDR are not possible. Read-only * accesses to the IDR may be done under the RCU read lock or may * exclude simultaneous writers. * * Return: The newly allocated ID, -ENOMEM if memory allocation failed, * or -ENOSPC if no free IDs could be found. */ int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp) { u32 id = idr->idr_next; int err, max = end > 0 ? end - 1 : INT_MAX; if ((int)id < start) id = start; err = idr_alloc_u32(idr, ptr, &id, max, gfp); if ((err == -ENOSPC) && (id > start)) { id = start; err = idr_alloc_u32(idr, ptr, &id, max, gfp); } if (err) return err; idr->idr_next = id + 1; return id; } EXPORT_SYMBOL(idr_alloc_cyclic); /** * idr_remove() - Remove an ID from the IDR. * @idr: IDR handle. * @id: Pointer ID. * * Removes this ID from the IDR. If the ID was not previously in the IDR, * this function returns %NULL. * * Since this function modifies the IDR, the caller should provide their * own locking to ensure that concurrent modification of the same IDR is * not possible. * * Return: The pointer formerly associated with this ID. */ void *idr_remove(struct idr *idr, unsigned long id) { return radix_tree_delete_item(&idr->idr_rt, id - idr->idr_base, NULL); } EXPORT_SYMBOL_GPL(idr_remove); /** * idr_find() - Return pointer for given ID. * @idr: IDR handle. * @id: Pointer ID. * * Looks up the pointer associated with this ID. A %NULL pointer may * indicate that @id is not allocated or that the %NULL pointer was * associated with this ID. * * This function can be called under rcu_read_lock(), given that the leaf * pointers lifetimes are correctly managed. * * Return: The pointer associated with this ID. */ void *idr_find(const struct idr *idr, unsigned long id) { return radix_tree_lookup(&idr->idr_rt, id - idr->idr_base); } EXPORT_SYMBOL_GPL(idr_find); /** * idr_for_each() - Iterate through all stored pointers. * @idr: IDR handle. * @fn: Function to be called for each pointer. * @data: Data passed to callback function. * * The callback function will be called for each entry in @idr, passing * the ID, the entry and @data. * * If @fn returns anything other than %0, the iteration stops and that * value is returned from this function. * * idr_for_each() can be called concurrently with idr_alloc() and * idr_remove() if protected by RCU. Newly added entries may not be * seen and deleted entries may be seen, but adding and removing entries * will not cause other entries to be skipped, nor spurious ones to be seen. */ int idr_for_each(const struct idr *idr, int (*fn)(int id, void *p, void *data), void *data) { struct radix_tree_iter iter; void __rcu **slot; int base = idr->idr_base; radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, 0) { int ret; unsigned long id = iter.index + base; if (WARN_ON_ONCE(id > INT_MAX)) break; ret = fn(id, rcu_dereference_raw(*slot), data); if (ret) return ret; } return 0; } EXPORT_SYMBOL(idr_for_each); /** * idr_get_next_ul() - Find next populated entry. * @idr: IDR handle. * @nextid: Pointer to an ID. * * Returns the next populated entry in the tree with an ID greater than * or equal to the value pointed to by @nextid. On exit, @nextid is updated * to the ID of the found value. To use in a loop, the value pointed to by * nextid must be incremented by the user. */ void *idr_get_next_ul(struct idr *idr, unsigned long *nextid) { struct radix_tree_iter iter; void __rcu **slot; void *entry = NULL; unsigned long base = idr->idr_base; unsigned long id = *nextid; id = (id < base) ? 0 : id - base; radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, id) { entry = rcu_dereference_raw(*slot); if (!entry) continue; if (!xa_is_internal(entry)) break; if (slot != &idr->idr_rt.xa_head && !xa_is_retry(entry)) break; slot = radix_tree_iter_retry(&iter); } if (!slot) return NULL; *nextid = iter.index + base; return entry; } EXPORT_SYMBOL(idr_get_next_ul); /** * idr_get_next() - Find next populated entry. * @idr: IDR handle. * @nextid: Pointer to an ID. * * Returns the next populated entry in the tree with an ID greater than * or equal to the value pointed to by @nextid. On exit, @nextid is updated * to the ID of the found value. To use in a loop, the value pointed to by * nextid must be incremented by the user. */ void *idr_get_next(struct idr *idr, int *nextid) { unsigned long id = *nextid; void *entry = idr_get_next_ul(idr, &id); if (WARN_ON_ONCE(id > INT_MAX)) return NULL; *nextid = id; return entry; } EXPORT_SYMBOL(idr_get_next); /** * idr_replace() - replace pointer for given ID. * @idr: IDR handle. * @ptr: New pointer to associate with the ID. * @id: ID to change. * * Replace the pointer registered with an ID and return the old value. * This function can be called under the RCU read lock concurrently with * idr_alloc() and idr_remove() (as long as the ID being removed is not * the one being replaced!). * * Returns: the old value on success. %-ENOENT indicates that @id was not * found. %-EINVAL indicates that @ptr was not valid. */ void *idr_replace(struct idr *idr, void *ptr, unsigned long id) { struct radix_tree_node *node; void __rcu **slot = NULL; void *entry; id -= idr->idr_base; entry = __radix_tree_lookup(&idr->idr_rt, id, &node, &slot); if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE)) return ERR_PTR(-ENOENT); __radix_tree_replace(&idr->idr_rt, node, slot, ptr); return entry; } EXPORT_SYMBOL(idr_replace); /** * DOC: IDA description * * The IDA is an ID allocator which does not provide the ability to * associate an ID with a pointer. As such, it only needs to store one * bit per ID, and so is more space efficient than an IDR. To use an IDA, * define it using DEFINE_IDA() (or embed a &struct ida in a data structure, * then initialise it using ida_init()). To allocate a new ID, call * ida_alloc(), ida_alloc_min(), ida_alloc_max() or ida_alloc_range(). * To free an ID, call ida_free(). * * ida_destroy() can be used to dispose of an IDA without needing to * free the individual IDs in it. You can use ida_is_empty() to find * out whether the IDA has any IDs currently allocated. * * The IDA handles its own locking. It is safe to call any of the IDA * functions without synchronisation in your code. * * IDs are currently limited to the range [0-INT_MAX]. If this is an awkward * limitation, it should be quite straightforward to raise the maximum. */ /* * Developer's notes: * * The IDA uses the functionality provided by the XArray to store bitmaps in * each entry. The XA_FREE_MARK is only cleared when all bits in the bitmap * have been set. * * I considered telling the XArray that each slot is an order-10 node * and indexing by bit number, but the XArray can't allow a single multi-index * entry in the head, which would significantly increase memory consumption * for the IDA. So instead we divide the index by the number of bits in the * leaf bitmap before doing a radix tree lookup. * * As an optimisation, if there are only a few low bits set in any given * leaf, instead of allocating a 128-byte bitmap, we store the bits * as a value entry. Value entries never have the XA_FREE_MARK cleared * because we can always convert them into a bitmap entry. * * It would be possible to optimise further; once we've run out of a * single 128-byte bitmap, we currently switch to a 576-byte node, put * the 128-byte bitmap in the first entry and then start allocating extra * 128-byte entries. We could instead use the 512 bytes of the node's * data as a bitmap before moving to that scheme. I do not believe this * is a worthwhile optimisation; Rasmus Villemoes surveyed the current * users of the IDA and almost none of them use more than 1024 entries. * Those that do use more than the 8192 IDs that the 512 bytes would * provide. * * The IDA always uses a lock to alloc/free. If we add a 'test_bit' * equivalent, it will still need locking. Going to RCU lookup would require * using RCU to free bitmaps, and that's not trivial without embedding an * RCU head in the bitmap, which adds a 2-pointer overhead to each 128-byte * bitmap, which is excessive. */ /** * ida_alloc_range() - Allocate an unused ID. * @ida: IDA handle. * @min: Lowest ID to allocate. * @max: Highest ID to allocate. * @gfp: Memory allocation flags. * * Allocate an ID between @min and @max, inclusive. The allocated ID will * not exceed %INT_MAX, even if @max is larger. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max, gfp_t gfp) { XA_STATE(xas, &ida->xa, min / IDA_BITMAP_BITS); unsigned bit = min % IDA_BITMAP_BITS; unsigned long flags; struct ida_bitmap *bitmap, *alloc = NULL; if ((int)min < 0) return -ENOSPC; if ((int)max < 0) max = INT_MAX; retry: xas_lock_irqsave(&xas, flags); next: bitmap = xas_find_marked(&xas, max / IDA_BITMAP_BITS, XA_FREE_MARK); if (xas.xa_index > min / IDA_BITMAP_BITS) bit = 0; if (xas.xa_index * IDA_BITMAP_BITS + bit > max) goto nospc; if (xa_is_value(bitmap)) { unsigned long tmp = xa_to_value(bitmap); if (bit < BITS_PER_XA_VALUE) { bit = find_next_zero_bit(&tmp, BITS_PER_XA_VALUE, bit); if (xas.xa_index * IDA_BITMAP_BITS + bit > max) goto nospc; if (bit < BITS_PER_XA_VALUE) { tmp |= 1UL << bit; xas_store(&xas, xa_mk_value(tmp)); goto out; } } bitmap = alloc; if (!bitmap) bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT); if (!bitmap) goto alloc; bitmap->bitmap[0] = tmp; xas_store(&xas, bitmap); if (xas_error(&xas)) { bitmap->bitmap[0] = 0; goto out; } } if (bitmap) { bit = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, bit); if (xas.xa_index * IDA_BITMAP_BITS + bit > max) goto nospc; if (bit == IDA_BITMAP_BITS) goto next; __set_bit(bit, bitmap->bitmap); if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS)) xas_clear_mark(&xas, XA_FREE_MARK); } else { if (bit < BITS_PER_XA_VALUE) { bitmap = xa_mk_value(1UL << bit); } else { bitmap = alloc; if (!bitmap) bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT); if (!bitmap) goto alloc; __set_bit(bit, bitmap->bitmap); } xas_store(&xas, bitmap); } out: xas_unlock_irqrestore(&xas, flags); if (xas_nomem(&xas, gfp)) { xas.xa_index = min / IDA_BITMAP_BITS; bit = min % IDA_BITMAP_BITS; goto retry; } if (bitmap != alloc) kfree(alloc); if (xas_error(&xas)) return xas_error(&xas); return xas.xa_index * IDA_BITMAP_BITS + bit; alloc: xas_unlock_irqrestore(&xas, flags); alloc = kzalloc(sizeof(*bitmap), gfp); if (!alloc) return -ENOMEM; xas_set(&xas, min / IDA_BITMAP_BITS); bit = min % IDA_BITMAP_BITS; goto retry; nospc: xas_unlock_irqrestore(&xas, flags); kfree(alloc); return -ENOSPC; } EXPORT_SYMBOL(ida_alloc_range); /** * ida_free() - Release an allocated ID. * @ida: IDA handle. * @id: Previously allocated ID. * * Context: Any context. It is safe to call this function without * locking in your code. */ void ida_free(struct ida *ida, unsigned int id) { XA_STATE(xas, &ida->xa, id / IDA_BITMAP_BITS); unsigned bit = id % IDA_BITMAP_BITS; struct ida_bitmap *bitmap; unsigned long flags; if ((int)id < 0) return; xas_lock_irqsave(&xas, flags); bitmap = xas_load(&xas); if (xa_is_value(bitmap)) { unsigned long v = xa_to_value(bitmap); if (bit >= BITS_PER_XA_VALUE) goto err; if (!(v & (1UL << bit))) goto err; v &= ~(1UL << bit); if (!v) goto delete; xas_store(&xas, xa_mk_value(v)); } else { if (!bitmap || !test_bit(bit, bitmap->bitmap)) goto err; __clear_bit(bit, bitmap->bitmap); xas_set_mark(&xas, XA_FREE_MARK); if (bitmap_empty(bitmap->bitmap, IDA_BITMAP_BITS)) { kfree(bitmap); delete: xas_store(&xas, NULL); } } xas_unlock_irqrestore(&xas, flags); return; err: xas_unlock_irqrestore(&xas, flags); WARN(1, "ida_free called for id=%d which is not allocated.\n", id); } EXPORT_SYMBOL(ida_free); /** * ida_destroy() - Free all IDs. * @ida: IDA handle. * * Calling this function frees all IDs and releases all resources used * by an IDA. When this call returns, the IDA is empty and can be reused * or freed. If the IDA is already empty, there is no need to call this * function. * * Context: Any context. It is safe to call this function without * locking in your code. */ void ida_destroy(struct ida *ida) { XA_STATE(xas, &ida->xa, 0); struct ida_bitmap *bitmap; unsigned long flags; xas_lock_irqsave(&xas, flags); xas_for_each(&xas, bitmap, ULONG_MAX) { if (!xa_is_value(bitmap)) kfree(bitmap); xas_store(&xas, NULL); } xas_unlock_irqrestore(&xas, flags); } EXPORT_SYMBOL(ida_destroy); #ifndef __KERNEL__ extern void xa_dump_index(unsigned long index, unsigned int shift); #define IDA_CHUNK_SHIFT ilog2(IDA_BITMAP_BITS) static void ida_dump_entry(void *entry, unsigned long index) { unsigned long i; if (!entry) return; if (xa_is_node(entry)) { struct xa_node *node = xa_to_node(entry); unsigned int shift = node->shift + IDA_CHUNK_SHIFT + XA_CHUNK_SHIFT; xa_dump_index(index * IDA_BITMAP_BITS, shift); xa_dump_node(node); for (i = 0; i < XA_CHUNK_SIZE; i++) ida_dump_entry(node->slots[i], index | (i << node->shift)); } else if (xa_is_value(entry)) { xa_dump_index(index * IDA_BITMAP_BITS, ilog2(BITS_PER_LONG)); pr_cont("value: data %lx [%px]\n", xa_to_value(entry), entry); } else { struct ida_bitmap *bitmap = entry; xa_dump_index(index * IDA_BITMAP_BITS, IDA_CHUNK_SHIFT); pr_cont("bitmap: %p data", bitmap); for (i = 0; i < IDA_BITMAP_LONGS; i++) pr_cont(" %lx", bitmap->bitmap[i]); pr_cont("\n"); } } static void ida_dump(struct ida *ida) { struct xarray *xa = &ida->xa; pr_debug("ida: %p node %p free %d\n", ida, xa->xa_head, xa->xa_flags >> ROOT_TAG_SHIFT); ida_dump_entry(xa->xa_head, 0); } #endif |
4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | /* * Cryptographic API. * * T10 Data Integrity Field CRC16 Crypto Transform * * Copyright (c) 2007 Oracle Corporation. All rights reserved. * Written by Martin K. Petersen <martin.petersen@oracle.com> * Copyright (C) 2013 Intel Corporation * Author: Tim Chen <tim.c.chen@linux.intel.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/crc-t10dif.h> #include <linux/module.h> #include <linux/kernel.h> /* Table generated using the following polynomium: * x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1 * gt: 0x8bb7 */ static const __u16 t10_dif_crc_table[256] = { 0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B, 0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6, 0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6, 0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B, 0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1, 0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C, 0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C, 0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781, 0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8, 0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255, 0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925, 0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698, 0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472, 0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF, 0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF, 0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02, 0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA, 0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067, 0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17, 0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA, 0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640, 0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD, 0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D, 0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30, 0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759, 0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4, 0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394, 0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29, 0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3, 0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E, 0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E, 0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3 }; __u16 crc_t10dif_generic(__u16 crc, const unsigned char *buffer, size_t len) { unsigned int i; for (i = 0 ; i < len ; i++) crc = (crc << 8) ^ t10_dif_crc_table[((crc >> 8) ^ buffer[i]) & 0xff]; return crc; } EXPORT_SYMBOL(crc_t10dif_generic); MODULE_DESCRIPTION("T10 DIF CRC calculation common code"); MODULE_LICENSE("GPL"); |
5 1 1 1 1 1 1 1 1 4 4 4 4 4 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 | /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2008 Cisco. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #define pr_fmt(fmt) "user_mad: " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/device.h> #include <linux/err.h> #include <linux/fs.h> #include <linux/cdev.h> #include <linux/dma-mapping.h> #include <linux/poll.h> #include <linux/mutex.h> #include <linux/kref.h> #include <linux/compat.h> #include <linux/sched.h> #include <linux/semaphore.h> #include <linux/slab.h> #include <linux/nospec.h> #include <linux/uaccess.h> #include <rdma/ib_mad.h> #include <rdma/ib_user_mad.h> #include <rdma/rdma_netlink.h> #include "core_priv.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand userspace MAD packet access"); MODULE_LICENSE("Dual BSD/GPL"); #define MAX_UMAD_RECV_LIST_SIZE 200000 enum { IB_UMAD_MAX_PORTS = RDMA_MAX_PORTS, IB_UMAD_MAX_AGENTS = 32, IB_UMAD_MAJOR = 231, IB_UMAD_MINOR_BASE = 0, IB_UMAD_NUM_FIXED_MINOR = 64, IB_UMAD_NUM_DYNAMIC_MINOR = IB_UMAD_MAX_PORTS - IB_UMAD_NUM_FIXED_MINOR, IB_ISSM_MINOR_BASE = IB_UMAD_NUM_FIXED_MINOR, }; /* * Our lifetime rules for these structs are the following: * device special file is opened, we take a reference on the * ib_umad_port's struct ib_umad_device. We drop these * references in the corresponding close(). * * In addition to references coming from open character devices, there * is one more reference to each ib_umad_device representing the * module's reference taken when allocating the ib_umad_device in * ib_umad_add_one(). * * When destroying an ib_umad_device, we drop the module's reference. */ struct ib_umad_port { struct cdev cdev; struct device dev; struct cdev sm_cdev; struct device sm_dev; struct semaphore sm_sem; struct mutex file_mutex; struct list_head file_list; struct ib_device *ib_dev; struct ib_umad_device *umad_dev; int dev_num; u32 port_num; }; struct ib_umad_device { struct kref kref; struct ib_umad_port ports[]; }; struct ib_umad_file { struct mutex mutex; struct ib_umad_port *port; struct list_head recv_list; atomic_t recv_list_size; struct list_head send_list; struct list_head port_list; spinlock_t send_lock; wait_queue_head_t recv_wait; struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS]; int agents_dead; u8 use_pkey_index; u8 already_used; }; struct ib_umad_packet { struct ib_mad_send_buf *msg; struct ib_mad_recv_wc *recv_wc; struct list_head list; int length; struct ib_user_mad mad; }; struct ib_rmpp_mad_hdr { struct ib_mad_hdr mad_hdr; struct ib_rmpp_hdr rmpp_hdr; } __packed; #define CREATE_TRACE_POINTS #include <trace/events/ib_umad.h> static const dev_t base_umad_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE); static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) + IB_UMAD_NUM_FIXED_MINOR; static dev_t dynamic_umad_dev; static dev_t dynamic_issm_dev; static DEFINE_IDA(umad_ida); static int ib_umad_add_one(struct ib_device *device); static void ib_umad_remove_one(struct ib_device *device, void *client_data); static void ib_umad_dev_free(struct kref *kref) { struct ib_umad_device *dev = container_of(kref, struct ib_umad_device, kref); kfree(dev); } static void ib_umad_dev_get(struct ib_umad_device *dev) { kref_get(&dev->kref); } static void ib_umad_dev_put(struct ib_umad_device *dev) { kref_put(&dev->kref, ib_umad_dev_free); } static int hdr_size(struct ib_umad_file *file) { return file->use_pkey_index ? sizeof(struct ib_user_mad_hdr) : sizeof(struct ib_user_mad_hdr_old); } /* caller must hold file->mutex */ static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id) { return file->agents_dead ? NULL : file->agent[id]; } static int queue_packet(struct ib_umad_file *file, struct ib_mad_agent *agent, struct ib_umad_packet *packet, bool is_recv_mad) { int ret = 1; mutex_lock(&file->mutex); if (is_recv_mad && atomic_read(&file->recv_list_size) > MAX_UMAD_RECV_LIST_SIZE) goto unlock; for (packet->mad.hdr.id = 0; packet->mad.hdr.id < IB_UMAD_MAX_AGENTS; packet->mad.hdr.id++) if (agent == __get_agent(file, packet->mad.hdr.id)) { list_add_tail(&packet->list, &file->recv_list); atomic_inc(&file->recv_list_size); wake_up_interruptible(&file->recv_wait); ret = 0; break; } unlock: mutex_unlock(&file->mutex); return ret; } static void dequeue_send(struct ib_umad_file *file, struct ib_umad_packet *packet) { spin_lock_irq(&file->send_lock); list_del(&packet->list); spin_unlock_irq(&file->send_lock); } static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *send_wc) { struct ib_umad_file *file = agent->context; struct ib_umad_packet *packet = send_wc->send_buf->context[0]; dequeue_send(file, packet); rdma_destroy_ah(packet->msg->ah, RDMA_DESTROY_AH_SLEEPABLE); ib_free_send_mad(packet->msg); if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) { packet->length = IB_MGMT_MAD_HDR; packet->mad.hdr.status = ETIMEDOUT; if (!queue_packet(file, agent, packet, false)) return; } kfree(packet); } static void recv_handler(struct ib_mad_agent *agent, struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_umad_file *file = agent->context; struct ib_umad_packet *packet; if (mad_recv_wc->wc->status != IB_WC_SUCCESS) goto err1; packet = kzalloc(sizeof *packet, GFP_KERNEL); if (!packet) goto err1; packet->length = mad_recv_wc->mad_len; packet->recv_wc = mad_recv_wc; packet->mad.hdr.status = 0; packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len; packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp); /* * On OPA devices it is okay to lose the upper 16 bits of LID as this * information is obtained elsewhere. Mask off the upper 16 bits. */ if (rdma_cap_opa_mad(agent->device, agent->port_num)) packet->mad.hdr.lid = ib_lid_be16(0xFFFF & mad_recv_wc->wc->slid); else packet->mad.hdr.lid = ib_lid_be16(mad_recv_wc->wc->slid); packet->mad.hdr.sl = mad_recv_wc->wc->sl; packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits; packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index; packet->mad.hdr.grh_present = !!(mad_recv_wc->wc->wc_flags & IB_WC_GRH); if (packet->mad.hdr.grh_present) { struct rdma_ah_attr ah_attr; const struct ib_global_route *grh; int ret; ret = ib_init_ah_attr_from_wc(agent->device, agent->port_num, mad_recv_wc->wc, mad_recv_wc->recv_buf.grh, &ah_attr); if (ret) goto err2; grh = rdma_ah_read_grh(&ah_attr); packet->mad.hdr.gid_index = grh->sgid_index; packet->mad.hdr.hop_limit = grh->hop_limit; packet->mad.hdr.traffic_class = grh->traffic_class; memcpy(packet->mad.hdr.gid, &grh->dgid, 16); packet->mad.hdr.flow_label = cpu_to_be32(grh->flow_label); rdma_destroy_ah_attr(&ah_attr); } if (queue_packet(file, agent, packet, true)) goto err2; return; err2: kfree(packet); err1: ib_free_recv_mad(mad_recv_wc); } static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf, struct ib_umad_packet *packet, size_t count) { struct ib_mad_recv_buf *recv_buf; int left, seg_payload, offset, max_seg_payload; size_t seg_size; recv_buf = &packet->recv_wc->recv_buf; seg_size = packet->recv_wc->mad_seg_size; /* We need enough room to copy the first (or only) MAD segment. */ if ((packet->length <= seg_size && count < hdr_size(file) + packet->length) || (packet->length > seg_size && count < hdr_size(file) + seg_size)) return -EINVAL; if (copy_to_user(buf, &packet->mad, hdr_size(file))) return -EFAULT; buf += hdr_size(file); seg_payload = min_t(int, packet->length, seg_size); if (copy_to_user(buf, recv_buf->mad, seg_payload)) return -EFAULT; if (seg_payload < packet->length) { /* * Multipacket RMPP MAD message. Copy remainder of message. * Note that last segment may have a shorter payload. */ if (count < hdr_size(file) + packet->length) { /* * The buffer is too small, return the first RMPP segment, * which includes the RMPP message length. */ return -ENOSPC; } offset = ib_get_mad_data_offset(recv_buf->mad->mad_hdr.mgmt_class); max_seg_payload = seg_size - offset; for (left = packet->length - seg_payload, buf += seg_payload; left; left -= seg_payload, buf += seg_payload) { recv_buf = container_of(recv_buf->list.next, struct ib_mad_recv_buf, list); seg_payload = min(left, max_seg_payload); if (copy_to_user(buf, ((void *) recv_buf->mad) + offset, seg_payload)) return -EFAULT; } } trace_ib_umad_read_recv(file, &packet->mad.hdr, &recv_buf->mad->mad_hdr); return hdr_size(file) + packet->length; } static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf, struct ib_umad_packet *packet, size_t count) { ssize_t size = hdr_size(file) + packet->length; if (count < size) return -EINVAL; if (copy_to_user(buf, &packet->mad, hdr_size(file))) return -EFAULT; buf += hdr_size(file); if (copy_to_user(buf, packet->mad.data, packet->length)) return -EFAULT; trace_ib_umad_read_send(file, &packet->mad.hdr, (struct ib_mad_hdr *)&packet->mad.data); return size; } static ssize_t ib_umad_read(struct file *filp, char __user *buf, size_t count, loff_t *pos) { struct ib_umad_file *file = filp->private_data; struct ib_umad_packet *packet; ssize_t ret; if (count < hdr_size(file)) return -EINVAL; mutex_lock(&file->mutex); if (file->agents_dead) { mutex_unlock(&file->mutex); return -EIO; } while (list_empty(&file->recv_list)) { mutex_unlock(&file->mutex); if (filp->f_flags & O_NONBLOCK) return -EAGAIN; if (wait_event_interruptible(file->recv_wait, !list_empty(&file->recv_list))) return -ERESTARTSYS; mutex_lock(&file->mutex); } if (file->agents_dead) { mutex_unlock(&file->mutex); return -EIO; } packet = list_entry(file->recv_list.next, struct ib_umad_packet, list); list_del(&packet->list); atomic_dec(&file->recv_list_size); mutex_unlock(&file->mutex); if (packet->recv_wc) ret = copy_recv_mad(file, buf, packet, count); else ret = copy_send_mad(file, buf, packet, count); if (ret < 0) { /* Requeue packet */ mutex_lock(&file->mutex); list_add(&packet->list, &file->recv_list); atomic_inc(&file->recv_list_size); mutex_unlock(&file->mutex); } else { if (packet->recv_wc) ib_free_recv_mad(packet->recv_wc); kfree(packet); } return ret; } static int copy_rmpp_mad(struct ib_mad_send_buf *msg, const char __user *buf) { int left, seg; /* Copy class specific header */ if ((msg->hdr_len > IB_MGMT_RMPP_HDR) && copy_from_user(msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR, msg->hdr_len - IB_MGMT_RMPP_HDR)) return -EFAULT; /* All headers are in place. Copy data segments. */ for (seg = 1, left = msg->data_len, buf += msg->hdr_len; left > 0; seg++, left -= msg->seg_size, buf += msg->seg_size) { if (copy_from_user(ib_get_rmpp_segment(msg, seg), buf, min(left, msg->seg_size))) return -EFAULT; } return 0; } static int same_destination(struct ib_user_mad_hdr *hdr1, struct ib_user_mad_hdr *hdr2) { if (!hdr1->grh_present && !hdr2->grh_present) return (hdr1->lid == hdr2->lid); if (hdr1->grh_present && hdr2->grh_present) return !memcmp(hdr1->gid, hdr2->gid, 16); return 0; } static int is_duplicate(struct ib_umad_file *file, struct ib_umad_packet *packet) { struct ib_umad_packet *sent_packet; struct ib_mad_hdr *sent_hdr, *hdr; hdr = (struct ib_mad_hdr *) packet->mad.data; list_for_each_entry(sent_packet, &file->send_list, list) { sent_hdr = (struct ib_mad_hdr *) sent_packet->mad.data; if ((hdr->tid != sent_hdr->tid) || (hdr->mgmt_class != sent_hdr->mgmt_class)) continue; /* * No need to be overly clever here. If two new operations have * the same TID, reject the second as a duplicate. This is more * restrictive than required by the spec. */ if (!ib_response_mad(hdr)) { if (!ib_response_mad(sent_hdr)) return 1; continue; } else if (!ib_response_mad(sent_hdr)) continue; if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr)) return 1; } return 0; } static ssize_t ib_umad_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { struct ib_umad_file *file = filp->private_data; struct ib_rmpp_mad_hdr *rmpp_mad_hdr; struct ib_umad_packet *packet; struct ib_mad_agent *agent; struct rdma_ah_attr ah_attr; struct ib_ah *ah; __be64 *tid; int ret, data_len, hdr_len, copy_offset, rmpp_active; u8 base_version; if (count < hdr_size(file) + IB_MGMT_RMPP_HDR) return -EINVAL; packet = kzalloc(sizeof(*packet) + IB_MGMT_RMPP_HDR, GFP_KERNEL); if (!packet) return -ENOMEM; if (copy_from_user(&packet->mad, buf, hdr_size(file))) { ret = -EFAULT; goto err; } if (packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) { ret = -EINVAL; goto err; } buf += hdr_size(file); if (copy_from_user(packet->mad.data, buf, IB_MGMT_RMPP_HDR)) { ret = -EFAULT; goto err; } mutex_lock(&file->mutex); trace_ib_umad_write(file, &packet->mad.hdr, (struct ib_mad_hdr *)&packet->mad.data); agent = __get_agent(file, packet->mad.hdr.id); if (!agent) { ret = -EIO; goto err_up; } memset(&ah_attr, 0, sizeof ah_attr); ah_attr.type = rdma_ah_find_type(agent->device, file->port->port_num); rdma_ah_set_dlid(&ah_attr, be16_to_cpu(packet->mad.hdr.lid)); rdma_ah_set_sl(&ah_attr, packet->mad.hdr.sl); rdma_ah_set_path_bits(&ah_attr, packet->mad.hdr.path_bits); rdma_ah_set_port_num(&ah_attr, file->port->port_num); if (packet->mad.hdr.grh_present) { rdma_ah_set_grh(&ah_attr, NULL, be32_to_cpu(packet->mad.hdr.flow_label), packet->mad.hdr.gid_index, packet->mad.hdr.hop_limit, packet->mad.hdr.traffic_class); rdma_ah_set_dgid_raw(&ah_attr, packet->mad.hdr.gid); } ah = rdma_create_user_ah(agent->qp->pd, &ah_attr, NULL); if (IS_ERR(ah)) { ret = PTR_ERR(ah); goto err_up; } rmpp_mad_hdr = (struct ib_rmpp_mad_hdr *)packet->mad.data; hdr_len = ib_get_mad_data_offset(rmpp_mad_hdr->mad_hdr.mgmt_class); if (ib_is_mad_class_rmpp(rmpp_mad_hdr->mad_hdr.mgmt_class) && ib_mad_kernel_rmpp_agent(agent)) { copy_offset = IB_MGMT_RMPP_HDR; rmpp_active = ib_get_rmpp_flags(&rmpp_mad_hdr->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE; } else { copy_offset = IB_MGMT_MAD_HDR; rmpp_active = 0; } base_version = ((struct ib_mad_hdr *)&packet->mad.data)->base_version; data_len = count - hdr_size(file) - hdr_len; packet->msg = ib_create_send_mad(agent, be32_to_cpu(packet->mad.hdr.qpn), packet->mad.hdr.pkey_index, rmpp_active, hdr_len, data_len, GFP_KERNEL, base_version); if (IS_ERR(packet->msg)) { ret = PTR_ERR(packet->msg); goto err_ah; } packet->msg->ah = ah; packet->msg->timeout_ms = packet->mad.hdr.timeout_ms; packet->msg->retries = packet->mad.hdr.retries; packet->msg->context[0] = packet; /* Copy MAD header. Any RMPP header is already in place. */ memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR); if (!rmpp_active) { if (copy_from_user(packet->msg->mad + copy_offset, buf + copy_offset, hdr_len + data_len - copy_offset)) { ret = -EFAULT; goto err_msg; } } else { ret = copy_rmpp_mad(packet->msg, buf); if (ret) goto err_msg; } /* * Set the high-order part of the transaction ID to make MADs from * different agents unique, and allow routing responses back to the * original requestor. */ if (!ib_response_mad(packet->msg->mad)) { tid = &((struct ib_mad_hdr *) packet->msg->mad)->tid; *tid = cpu_to_be64(((u64) agent->hi_tid) << 32 | (be64_to_cpup(tid) & 0xffffffff)); rmpp_mad_hdr->mad_hdr.tid = *tid; } if (!ib_mad_kernel_rmpp_agent(agent) && ib_is_mad_class_rmpp(rmpp_mad_hdr->mad_hdr.mgmt_class) && (ib_get_rmpp_flags(&rmpp_mad_hdr->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) { spin_lock_irq(&file->send_lock); list_add_tail(&packet->list, &file->send_list); spin_unlock_irq(&file->send_lock); } else { spin_lock_irq(&file->send_lock); ret = is_duplicate(file, packet); if (!ret) list_add_tail(&packet->list, &file->send_list); spin_unlock_irq(&file->send_lock); if (ret) { ret = -EINVAL; goto err_msg; } } ret = ib_post_send_mad(packet->msg, NULL); if (ret) goto err_send; mutex_unlock(&file->mutex); return count; err_send: dequeue_send(file, packet); err_msg: ib_free_send_mad(packet->msg); err_ah: rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE); err_up: mutex_unlock(&file->mutex); err: kfree(packet); return ret; } static __poll_t ib_umad_poll(struct file *filp, struct poll_table_struct *wait) { struct ib_umad_file *file = filp->private_data; /* we will always be able to post a MAD send */ __poll_t mask = EPOLLOUT | EPOLLWRNORM; mutex_lock(&file->mutex); poll_wait(filp, &file->recv_wait, wait); if (!list_empty(&file->recv_list)) mask |= EPOLLIN | EPOLLRDNORM; if (file->agents_dead) mask = EPOLLERR; mutex_unlock(&file->mutex); return mask; } static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, int compat_method_mask) { struct ib_user_mad_reg_req ureq; struct ib_mad_reg_req req; struct ib_mad_agent *agent = NULL; int agent_id; int ret; mutex_lock(&file->port->file_mutex); mutex_lock(&file->mutex); if (!file->port->ib_dev) { dev_notice(&file->port->dev, "%s: invalid device\n", __func__); ret = -EPIPE; goto out; } if (copy_from_user(&ureq, arg, sizeof ureq)) { ret = -EFAULT; goto out; } if (ureq.qpn != 0 && ureq.qpn != 1) { dev_notice(&file->port->dev, "%s: invalid QPN %u specified\n", __func__, ureq.qpn); ret = -EINVAL; goto out; } for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id) if (!__get_agent(file, agent_id)) goto found; dev_notice(&file->port->dev, "%s: Max Agents (%u) reached\n", __func__, IB_UMAD_MAX_AGENTS); ret = -ENOMEM; goto out; found: if (ureq.mgmt_class) { memset(&req, 0, sizeof(req)); req.mgmt_class = ureq.mgmt_class; req.mgmt_class_version = ureq.mgmt_class_version; memcpy(req.oui, ureq.oui, sizeof req.oui); if (compat_method_mask) { u32 *umm = (u32 *) ureq.method_mask; int i; for (i = 0; i < BITS_TO_LONGS(IB_MGMT_MAX_METHODS); ++i) req.method_mask[i] = umm[i * 2] | ((u64) umm[i * 2 + 1] << 32); } else memcpy(req.method_mask, ureq.method_mask, sizeof req.method_mask); } agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num, ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI, ureq.mgmt_class ? &req : NULL, ureq.rmpp_version, send_handler, recv_handler, file, 0); if (IS_ERR(agent)) { ret = PTR_ERR(agent); agent = NULL; goto out; } if (put_user(agent_id, (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) { ret = -EFAULT; goto out; } if (!file->already_used) { file->already_used = 1; if (!file->use_pkey_index) { dev_warn(&file->port->dev, "process %s did not enable P_Key index support.\n", current->comm); dev_warn(&file->port->dev, " Documentation/infiniband/user_mad.rst has info on the new ABI.\n"); } } file->agent[agent_id] = agent; ret = 0; out: mutex_unlock(&file->mutex); if (ret && agent) ib_unregister_mad_agent(agent); mutex_unlock(&file->port->file_mutex); return ret; } static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg) { struct ib_user_mad_reg_req2 ureq; struct ib_mad_reg_req req; struct ib_mad_agent *agent = NULL; int agent_id; int ret; mutex_lock(&file->port->file_mutex); mutex_lock(&file->mutex); if (!file->port->ib_dev) { dev_notice(&file->port->dev, "%s: invalid device\n", __func__); ret = -EPIPE; goto out; } if (copy_from_user(&ureq, arg, sizeof(ureq))) { ret = -EFAULT; goto out; } if (ureq.qpn != 0 && ureq.qpn != 1) { dev_notice(&file->port->dev, "%s: invalid QPN %u specified\n", __func__, ureq.qpn); ret = -EINVAL; goto out; } if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) { dev_notice(&file->port->dev, "%s failed: invalid registration flags specified 0x%x; supported 0x%x\n", __func__, ureq.flags, IB_USER_MAD_REG_FLAGS_CAP); ret = -EINVAL; if (put_user((u32)IB_USER_MAD_REG_FLAGS_CAP, (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req2, flags)))) ret = -EFAULT; goto out; } for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id) if (!__get_agent(file, agent_id)) goto found; dev_notice(&file->port->dev, "%s: Max Agents (%u) reached\n", __func__, IB_UMAD_MAX_AGENTS); ret = -ENOMEM; goto out; found: if (ureq.mgmt_class) { memset(&req, 0, sizeof(req)); req.mgmt_class = ureq.mgmt_class; req.mgmt_class_version = ureq.mgmt_class_version; if (ureq.oui & 0xff000000) { dev_notice(&file->port->dev, "%s failed: oui invalid 0x%08x\n", __func__, ureq.oui); ret = -EINVAL; goto out; } req.oui[2] = ureq.oui & 0x0000ff; req.oui[1] = (ureq.oui & 0x00ff00) >> 8; req.oui[0] = (ureq.oui & 0xff0000) >> 16; memcpy(req.method_mask, ureq.method_mask, sizeof(req.method_mask)); } agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num, ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI, ureq.mgmt_class ? &req : NULL, ureq.rmpp_version, send_handler, recv_handler, file, ureq.flags); if (IS_ERR(agent)) { ret = PTR_ERR(agent); agent = NULL; goto out; } if (put_user(agent_id, (u32 __user *)(arg + offsetof(struct ib_user_mad_reg_req2, id)))) { ret = -EFAULT; goto out; } if (!file->already_used) { file->already_used = 1; file->use_pkey_index = 1; } file->agent[agent_id] = agent; ret = 0; out: mutex_unlock(&file->mutex); if (ret && agent) ib_unregister_mad_agent(agent); mutex_unlock(&file->port->file_mutex); return ret; } static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg) { struct ib_mad_agent *agent = NULL; u32 id; int ret = 0; if (get_user(id, arg)) return -EFAULT; if (id >= IB_UMAD_MAX_AGENTS) return -EINVAL; mutex_lock(&file->port->file_mutex); mutex_lock(&file->mutex); id = array_index_nospec(id, IB_UMAD_MAX_AGENTS); if (!__get_agent(file, id)) { ret = -EINVAL; goto out; } agent = file->agent[id]; file->agent[id] = NULL; out: mutex_unlock(&file->mutex); if (agent) ib_unregister_mad_agent(agent); mutex_unlock(&file->port->file_mutex); return ret; } static long ib_umad_enable_pkey(struct ib_umad_file *file) { int ret = 0; mutex_lock(&file->mutex); if (file->already_used) ret = -EINVAL; else file->use_pkey_index = 1; mutex_unlock(&file->mutex); return ret; } static long ib_umad_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { case IB_USER_MAD_REGISTER_AGENT: return ib_umad_reg_agent(filp->private_data, (void __user *) arg, 0); case IB_USER_MAD_UNREGISTER_AGENT: return ib_umad_unreg_agent(filp->private_data, (__u32 __user *) arg); case IB_USER_MAD_ENABLE_PKEY: return ib_umad_enable_pkey(filp->private_data); case IB_USER_MAD_REGISTER_AGENT2: return ib_umad_reg_agent2(filp->private_data, (void __user *) arg); default: return -ENOIOCTLCMD; } } #ifdef CONFIG_COMPAT static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { case IB_USER_MAD_REGISTER_AGENT: return ib_umad_reg_agent(filp->private_data, compat_ptr(arg), 1); case IB_USER_MAD_UNREGISTER_AGENT: return ib_umad_unreg_agent(filp->private_data, compat_ptr(arg)); case IB_USER_MAD_ENABLE_PKEY: return ib_umad_enable_pkey(filp->private_data); case IB_USER_MAD_REGISTER_AGENT2: return ib_umad_reg_agent2(filp->private_data, compat_ptr(arg)); default: return -ENOIOCTLCMD; } } #endif /* * ib_umad_open() does not need the BKL: * * - the ib_umad_port structures are properly reference counted, and * everything else is purely local to the file being created, so * races against other open calls are not a problem; * - the ioctl method does not affect any global state outside of the * file structure being operated on; */ static int ib_umad_open(struct inode *inode, struct file *filp) { struct ib_umad_port *port; struct ib_umad_file *file; int ret = 0; port = container_of(inode->i_cdev, struct ib_umad_port, cdev); mutex_lock(&port->file_mutex); if (!port->ib_dev) { ret = -ENXIO; goto out; } if (!rdma_dev_access_netns(port->ib_dev, current->nsproxy->net_ns)) { ret = -EPERM; goto out; } file = kzalloc(sizeof(*file), GFP_KERNEL); if (!file) { ret = -ENOMEM; goto out; } mutex_init(&file->mutex); spin_lock_init(&file->send_lock); INIT_LIST_HEAD(&file->recv_list); INIT_LIST_HEAD(&file->send_list); init_waitqueue_head(&file->recv_wait); file->port = port; filp->private_data = file; list_add_tail(&file->port_list, &port->file_list); stream_open(inode, filp); out: mutex_unlock(&port->file_mutex); return ret; } static int ib_umad_close(struct inode *inode, struct file *filp) { struct ib_umad_file *file = filp->private_data; struct ib_umad_packet *packet, *tmp; int already_dead; int i; mutex_lock(&file->port->file_mutex); mutex_lock(&file->mutex); already_dead = file->agents_dead; file->agents_dead = 1; list_for_each_entry_safe(packet, tmp, &file->recv_list, list) { if (packet->recv_wc) ib_free_recv_mad(packet->recv_wc); kfree(packet); } list_del(&file->port_list); mutex_unlock(&file->mutex); if (!already_dead) for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i) if (file->agent[i]) ib_unregister_mad_agent(file->agent[i]); mutex_unlock(&file->port->file_mutex); mutex_destroy(&file->mutex); kfree(file); return 0; } static const struct file_operations umad_fops = { .owner = THIS_MODULE, .read = ib_umad_read, .write = ib_umad_write, .poll = ib_umad_poll, .unlocked_ioctl = ib_umad_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ib_umad_compat_ioctl, #endif .open = ib_umad_open, .release = ib_umad_close, }; static int ib_umad_sm_open(struct inode *inode, struct file *filp) { struct ib_umad_port *port; struct ib_port_modify props = { .set_port_cap_mask = IB_PORT_SM }; int ret; port = container_of(inode->i_cdev, struct ib_umad_port, sm_cdev); if (filp->f_flags & O_NONBLOCK) { if (down_trylock(&port->sm_sem)) { ret = -EAGAIN; goto fail; } } else { if (down_interruptible(&port->sm_sem)) { ret = -ERESTARTSYS; goto fail; } } if (!rdma_dev_access_netns(port->ib_dev, current->nsproxy->net_ns)) { ret = -EPERM; goto err_up_sem; } ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props); if (ret) goto err_up_sem; filp->private_data = port; nonseekable_open(inode, filp); return 0; err_up_sem: up(&port->sm_sem); fail: return ret; } static int ib_umad_sm_close(struct inode *inode, struct file *filp) { struct ib_umad_port *port = filp->private_data; struct ib_port_modify props = { .clr_port_cap_mask = IB_PORT_SM }; int ret = 0; mutex_lock(&port->file_mutex); if (port->ib_dev) ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props); mutex_unlock(&port->file_mutex); up(&port->sm_sem); return ret; } static const struct file_operations umad_sm_fops = { .owner = THIS_MODULE, .open = ib_umad_sm_open, .release = ib_umad_sm_close, }; static struct ib_umad_port *get_port(struct ib_device *ibdev, struct ib_umad_device *umad_dev, u32 port) { if (!umad_dev) return ERR_PTR(-EOPNOTSUPP); if (!rdma_is_port_valid(ibdev, port)) return ERR_PTR(-EINVAL); if (!rdma_cap_ib_mad(ibdev, port)) return ERR_PTR(-EOPNOTSUPP); return &umad_dev->ports[port - rdma_start_port(ibdev)]; } static int ib_umad_get_nl_info(struct ib_device *ibdev, void *client_data, struct ib_client_nl_info *res) { struct ib_umad_port *port = get_port(ibdev, client_data, res->port); if (IS_ERR(port)) return PTR_ERR(port); res->abi = IB_USER_MAD_ABI_VERSION; res->cdev = &port->dev; return 0; } static struct ib_client umad_client = { .name = "umad", .add = ib_umad_add_one, .remove = ib_umad_remove_one, .get_nl_info = ib_umad_get_nl_info, }; MODULE_ALIAS_RDMA_CLIENT("umad"); static int ib_issm_get_nl_info(struct ib_device *ibdev, void *client_data, struct ib_client_nl_info *res) { struct ib_umad_port *port = get_port(ibdev, client_data, res->port); if (IS_ERR(port)) return PTR_ERR(port); res->abi = IB_USER_MAD_ABI_VERSION; res->cdev = &port->sm_dev; return 0; } static struct ib_client issm_client = { .name = "issm", .get_nl_info = ib_issm_get_nl_info, }; MODULE_ALIAS_RDMA_CLIENT("issm"); static ssize_t ibdev_show(struct device *dev, struct device_attribute *attr, char *buf) { struct ib_umad_port *port = dev_get_drvdata(dev); if (!port) return -ENODEV; return sysfs_emit(buf, "%s\n", dev_name(&port->ib_dev->dev)); } static DEVICE_ATTR_RO(ibdev); static ssize_t port_show(struct device *dev, struct device_attribute *attr, char *buf) { struct ib_umad_port *port = dev_get_drvdata(dev); if (!port) return -ENODEV; return sysfs_emit(buf, "%d\n", port->port_num); } static DEVICE_ATTR_RO(port); static struct attribute *umad_class_dev_attrs[] = { &dev_attr_ibdev.attr, &dev_attr_port.attr, NULL, }; ATTRIBUTE_GROUPS(umad_class_dev); static char *umad_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); } static ssize_t abi_version_show(const struct class *class, const struct class_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", IB_USER_MAD_ABI_VERSION); } static CLASS_ATTR_RO(abi_version); static struct attribute *umad_class_attrs[] = { &class_attr_abi_version.attr, NULL, }; ATTRIBUTE_GROUPS(umad_class); static struct class umad_class = { .name = "infiniband_mad", .devnode = umad_devnode, .class_groups = umad_class_groups, .dev_groups = umad_class_dev_groups, }; static void ib_umad_release_port(struct device *device) { struct ib_umad_port *port = dev_get_drvdata(device); struct ib_umad_device *umad_dev = port->umad_dev; ib_umad_dev_put(umad_dev); } static void ib_umad_init_port_dev(struct device *dev, struct ib_umad_port *port, const struct ib_device *device) { device_initialize(dev); ib_umad_dev_get(port->umad_dev); dev->class = &umad_class; dev->parent = device->dev.parent; dev_set_drvdata(dev, port); dev->release = ib_umad_release_port; } static int ib_umad_init_port(struct ib_device *device, int port_num, struct ib_umad_device *umad_dev, struct ib_umad_port *port) { int devnum; dev_t base_umad; dev_t base_issm; int ret; devnum = ida_alloc_max(&umad_ida, IB_UMAD_MAX_PORTS - 1, GFP_KERNEL); if (devnum < 0) return -1; port->dev_num = devnum; if (devnum >= IB_UMAD_NUM_FIXED_MINOR) { base_umad = dynamic_umad_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; base_issm = dynamic_issm_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; } else { base_umad = devnum + base_umad_dev; base_issm = devnum + base_issm_dev; } port->ib_dev = device; port->umad_dev = umad_dev; port->port_num = port_num; sema_init(&port->sm_sem, 1); mutex_init(&port->file_mutex); INIT_LIST_HEAD(&port->file_list); ib_umad_init_port_dev(&port->dev, port, device); port->dev.devt = base_umad; dev_set_name(&port->dev, "umad%d", port->dev_num); cdev_init(&port->cdev, &umad_fops); port->cdev.owner = THIS_MODULE; ret = cdev_device_add(&port->cdev, &port->dev); if (ret) goto err_cdev; if (rdma_cap_ib_smi(device, port_num)) { ib_umad_init_port_dev(&port->sm_dev, port, device); port->sm_dev.devt = base_issm; dev_set_name(&port->sm_dev, "issm%d", port->dev_num); cdev_init(&port->sm_cdev, &umad_sm_fops); port->sm_cdev.owner = THIS_MODULE; ret = cdev_device_add(&port->sm_cdev, &port->sm_dev); if (ret) goto err_dev; } return 0; err_dev: put_device(&port->sm_dev); cdev_device_del(&port->cdev, &port->dev); err_cdev: put_device(&port->dev); ida_free(&umad_ida, devnum); return ret; } static void ib_umad_kill_port(struct ib_umad_port *port) { struct ib_umad_file *file; bool has_smi = false; int id; if (rdma_cap_ib_smi(port->ib_dev, port->port_num)) { cdev_device_del(&port->sm_cdev, &port->sm_dev); has_smi = true; } cdev_device_del(&port->cdev, &port->dev); mutex_lock(&port->file_mutex); /* Mark ib_dev NULL and block ioctl or other file ops to progress * further. */ port->ib_dev = NULL; list_for_each_entry(file, &port->file_list, port_list) { mutex_lock(&file->mutex); file->agents_dead = 1; wake_up_interruptible(&file->recv_wait); mutex_unlock(&file->mutex); for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id) if (file->agent[id]) ib_unregister_mad_agent(file->agent[id]); } mutex_unlock(&port->file_mutex); ida_free(&umad_ida, port->dev_num); /* balances device_initialize() */ if (has_smi) put_device(&port->sm_dev); put_device(&port->dev); } static int ib_umad_add_one(struct ib_device *device) { struct ib_umad_device *umad_dev; int s, e, i; int count = 0; int ret; s = rdma_start_port(device); e = rdma_end_port(device); umad_dev = kzalloc(struct_size(umad_dev, ports, size_add(size_sub(e, s), 1)), GFP_KERNEL); if (!umad_dev) return -ENOMEM; kref_init(&umad_dev->kref); for (i = s; i <= e; ++i) { if (!rdma_cap_ib_mad(device, i)) continue; ret = ib_umad_init_port(device, i, umad_dev, &umad_dev->ports[i - s]); if (ret) goto err; count++; } if (!count) { ret = -EOPNOTSUPP; goto free; } ib_set_client_data(device, &umad_client, umad_dev); return 0; err: while (--i >= s) { if (!rdma_cap_ib_mad(device, i)) continue; ib_umad_kill_port(&umad_dev->ports[i - s]); } free: /* balances kref_init */ ib_umad_dev_put(umad_dev); return ret; } static void ib_umad_remove_one(struct ib_device *device, void *client_data) { struct ib_umad_device *umad_dev = client_data; unsigned int i; rdma_for_each_port (device, i) { if (rdma_cap_ib_mad(device, i)) ib_umad_kill_port( &umad_dev->ports[i - rdma_start_port(device)]); } /* balances kref_init() */ ib_umad_dev_put(umad_dev); } static int __init ib_umad_init(void) { int ret; ret = register_chrdev_region(base_umad_dev, IB_UMAD_NUM_FIXED_MINOR * 2, umad_class.name); if (ret) { pr_err("couldn't register device number\n"); goto out; } ret = alloc_chrdev_region(&dynamic_umad_dev, 0, IB_UMAD_NUM_DYNAMIC_MINOR * 2, umad_class.name); if (ret) { pr_err("couldn't register dynamic device number\n"); goto out_alloc; } dynamic_issm_dev = dynamic_umad_dev + IB_UMAD_NUM_DYNAMIC_MINOR; ret = class_register(&umad_class); if (ret) { pr_err("couldn't create class infiniband_mad\n"); goto out_chrdev; } ret = ib_register_client(&umad_client); if (ret) goto out_class; ret = ib_register_client(&issm_client); if (ret) goto out_client; return 0; out_client: ib_unregister_client(&umad_client); out_class: class_unregister(&umad_class); out_chrdev: unregister_chrdev_region(dynamic_umad_dev, IB_UMAD_NUM_DYNAMIC_MINOR * 2); out_alloc: unregister_chrdev_region(base_umad_dev, IB_UMAD_NUM_FIXED_MINOR * 2); out: return ret; } static void __exit ib_umad_cleanup(void) { ib_unregister_client(&issm_client); ib_unregister_client(&umad_client); class_unregister(&umad_class); unregister_chrdev_region(base_umad_dev, IB_UMAD_NUM_FIXED_MINOR * 2); unregister_chrdev_region(dynamic_umad_dev, IB_UMAD_NUM_DYNAMIC_MINOR * 2); } module_init(ib_umad_init); module_exit(ib_umad_cleanup); |
950 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM fib6 #if !defined(_TRACE_FIB6_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FIB6_H #include <linux/in6.h> #include <net/flow.h> #include <net/ip6_fib.h> #include <linux/tracepoint.h> TRACE_EVENT(fib6_table_lookup, TP_PROTO(const struct net *net, const struct fib6_result *res, struct fib6_table *table, const struct flowi6 *flp), TP_ARGS(net, res, table, flp), TP_STRUCT__entry( __field( u32, tb_id ) __field( int, err ) __field( int, oif ) __field( int, iif ) __field( __u8, tos ) __field( __u8, scope ) __field( __u8, flags ) __array( __u8, src, 16 ) __array( __u8, dst, 16 ) __field( u16, sport ) __field( u16, dport ) __field( u8, proto ) __field( u8, rt_type ) __array( char, name, IFNAMSIZ ) __array( __u8, gw, 16 ) ), TP_fast_assign( struct in6_addr *in6; __entry->tb_id = table->tb6_id; __entry->err = ip6_rt_type_to_error(res->fib6_type); __entry->oif = flp->flowi6_oif; __entry->iif = flp->flowi6_iif; __entry->tos = ip6_tclass(flp->flowlabel); __entry->scope = flp->flowi6_scope; __entry->flags = flp->flowi6_flags; in6 = (struct in6_addr *)__entry->src; *in6 = flp->saddr; in6 = (struct in6_addr *)__entry->dst; *in6 = flp->daddr; __entry->proto = flp->flowi6_proto; if (__entry->proto == IPPROTO_TCP || __entry->proto == IPPROTO_UDP) { __entry->sport = ntohs(flp->fl6_sport); __entry->dport = ntohs(flp->fl6_dport); } else { __entry->sport = 0; __entry->dport = 0; } if (res->nh && res->nh->fib_nh_dev) { strscpy(__entry->name, res->nh->fib_nh_dev->name, IFNAMSIZ); } else { strcpy(__entry->name, "-"); } if (res->f6i == net->ipv6.fib6_null_entry) { in6 = (struct in6_addr *)__entry->gw; *in6 = in6addr_any; } else if (res->nh) { in6 = (struct in6_addr *)__entry->gw; *in6 = res->nh->fib_nh_gw6; } ), TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u tos %d scope %d flags %x ==> dev %s gw %pI6c err %d", __entry->tb_id, __entry->oif, __entry->iif, __entry->proto, __entry->src, __entry->sport, __entry->dst, __entry->dport, __entry->tos, __entry->scope, __entry->flags, __entry->name, __entry->gw, __entry->err) ); #endif /* _TRACE_FIB6_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
4 4 4 4 4 4 4 4 4 4 4 5 4 4 4 1 4 4 4 5 4 4 4 4 4 4 4 4 1 1 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/inode.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * 64-bit file support on 64-bit platforms by Jakub Jelinek * (jj@sunsite.ms.mff.cuni.cz) * * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 */ #include <linux/fs.h> #include <linux/mount.h> #include <linux/time.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/dax.h> #include <linux/quotaops.h> #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> #include <linux/workqueue.h> #include <linux/kernel.h> #include <linux/printk.h> #include <linux/slab.h> #include <linux/bitops.h> #include <linux/iomap.h> #include <linux/iversion.h> #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "truncate.h" #include <trace/events/ext4.h> static void ext4_journalled_zero_new_buffers(handle_t *handle, struct inode *inode, struct folio *folio, unsigned from, unsigned to); static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __u16 dummy_csum = 0; int offset = offsetof(struct ext4_inode, i_checksum_lo); unsigned int csum_size = sizeof(dummy_csum); csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset); csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, EXT4_GOOD_OLD_INODE_SIZE - offset); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { offset = offsetof(struct ext4_inode, i_checksum_hi); csum = ext4_chksum(sbi, csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE, offset - EXT4_GOOD_OLD_INODE_SIZE); if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; } csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, EXT4_INODE_SIZE(inode->i_sb) - offset); } return csum; } static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { __u32 provided, calculated; if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || !ext4_has_metadata_csum(inode->i_sb)) return 1; provided = le16_to_cpu(raw->i_checksum_lo); calculated = ext4_inode_csum(inode, raw, ei); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; else calculated &= 0xFFFF; return provided == calculated; } void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { __u32 csum; if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || !ext4_has_metadata_csum(inode->i_sb)) return; csum = ext4_inode_csum(inode, raw, ei); raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) raw->i_checksum_hi = cpu_to_le16(csum >> 16); } static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { trace_ext4_begin_ordered_truncate(inode, new_size); /* * If jinode is zero, then we never opened the file for * writing, so there's no need to call * jbd2_journal_begin_ordered_truncate() since there's no * outstanding writes we need to flush. */ if (!EXT4_I(inode)->jinode) return 0; return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode, new_size); } static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); /* * Test whether an inode is a fast symlink. * A fast symlink has its symlink data stored in ext4_inode_info->i_data. */ int ext4_inode_is_fast_symlink(struct inode *inode) { if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { int ea_blocks = EXT4_I(inode)->i_file_acl ? EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; if (ext4_has_inline_data(inode)) return 0; return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } return S_ISLNK(inode->i_mode) && inode->i_size && (inode->i_size < EXT4_N_BLOCKS * 4); } /* * Called at the last iput() if i_nlink is zero. */ void ext4_evict_inode(struct inode *inode) { handle_t *handle; int err; /* * Credits for final inode cleanup and freeing: * sb + inode (ext4_orphan_del()), block bitmap, group descriptor * (xattr block freeing), bitmap, group descriptor (inode freeing) */ int extra_credits = 6; struct ext4_xattr_inode_array *ea_inode_array = NULL; bool freeze_protected = false; trace_ext4_evict_inode(inode); if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ext4_evict_ea_inode(inode); if (inode->i_nlink) { truncate_inode_pages_final(&inode->i_data); goto no_delete; } if (is_bad_inode(inode)) goto no_delete; dquot_initialize(inode); if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages_final(&inode->i_data); /* * For inodes with journalled data, transaction commit could have * dirtied the inode. And for inodes with dioread_nolock, unwritten * extents converting worker could merge extents and also have dirtied * the inode. Flush worker is ignoring it because of I_FREEING flag but * we still need to remove the inode from the writeback lists. */ if (!list_empty_careful(&inode->i_io_list)) inode_io_list_del(inode); /* * Protect us against freezing - iput() caller didn't have to have any * protection against it. When we are in a running transaction though, * we are already protected against freezing and we cannot grab further * protection due to lock ordering constraints. */ if (!ext4_journal_current_handle()) { sb_start_intwrite(inode->i_sb); freeze_protected = true; } if (!IS_NOQUOTA(inode)) extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); /* * Block bitmap, group descriptor, and inode are accounted in both * ext4_blocks_for_truncate() and extra_credits. So subtract 3. */ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, ext4_blocks_for_truncate(inode) + extra_credits - 3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* * If we're going to skip the normal cleanup, we still need to * make sure that the in-core orphan linked list is properly * cleaned up. */ ext4_orphan_del(NULL, inode); if (freeze_protected) sb_end_intwrite(inode->i_sb); goto no_delete; } if (IS_SYNC(inode)) ext4_handle_sync(handle); /* * Set inode->i_size to 0 before calling ext4_truncate(). We need * special handling of symlinks here because i_size is used to * determine whether ext4_inode_info->i_data contains symlink data or * block mappings. Setting i_size to 0 will remove its fast symlink * status. Erase i_data so that it becomes a valid empty block map. */ if (ext4_inode_is_fast_symlink(inode)) memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data)); inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_warning(inode->i_sb, "couldn't mark inode dirty (err %d)", err); goto stop_handle; } if (inode->i_blocks) { err = ext4_truncate(inode); if (err) { ext4_error_err(inode->i_sb, -err, "couldn't truncate inode %lu (err %d)", inode->i_ino, err); goto stop_handle; } } /* Remove xattr references. */ err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array, extra_credits); if (err) { ext4_warning(inode->i_sb, "xattr delete (err %d)", err); stop_handle: ext4_journal_stop(handle); ext4_orphan_del(NULL, inode); if (freeze_protected) sb_end_intwrite(inode->i_sb); ext4_xattr_inode_array_free(ea_inode_array); goto no_delete; } /* * Kill off the orphan record which ext4_truncate created. * AKPM: I think this can be inside the above `if'. * Note that ext4_orphan_del() has to be able to cope with the * deletion of a non-existent orphan - this is because we don't * know if ext4_truncate() actually created an orphan record. * (Well, we could do this if we need to, but heck - it works) */ ext4_orphan_del(handle, inode); EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds(); /* * One subtle ordering requirement: if anything has gone wrong * (transaction abort, IO errors, whatever), then we can still * do these next steps (the fs will already have been marked as * having errors), but we can't free the inode if the mark_dirty * fails. */ if (ext4_mark_inode_dirty(handle, inode)) /* If that failed, just do the required in-core inode clear. */ ext4_clear_inode(inode); else ext4_free_inode(handle, inode); ext4_journal_stop(handle); if (freeze_protected) sb_end_intwrite(inode->i_sb); ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: /* * Check out some where else accidentally dirty the evicting inode, * which may probably cause inode use-after-free issues later. */ WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list)); if (!list_empty(&EXT4_I(inode)->i_fc_list)) ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } #ifdef CONFIG_QUOTA qsize_t *ext4_get_reserved_space(struct inode *inode) { return &EXT4_I(inode)->i_reserved_quota; } #endif /* * Called with i_data_sem down, which is important since we can call * ext4_discard_preallocations() from here. */ void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); spin_lock(&ei->i_block_reservation_lock); trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { ext4_warning(inode->i_sb, "%s: ino %lu, used %d " "with only %d reserved data blocks", __func__, inode->i_ino, used, ei->i_reserved_data_blocks); WARN_ON(1); used = ei->i_reserved_data_blocks; } /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); spin_unlock(&ei->i_block_reservation_lock); /* Update quota subsystem for data blocks */ if (quota_claim) dquot_claim_block(inode, EXT4_C2B(sbi, used)); else { /* * We did fallocate with an offset that is already delayed * allocated. So on delayed allocated writeback we should * not re-claim the quota for fallocated blocks. */ dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); } /* * If we have done all the pending block allocations and if * there aren't any writers on the inode, we can discard the * inode's preallocations. */ if ((ei->i_reserved_data_blocks == 0) && !inode_is_open_for_write(inode)) ext4_discard_preallocations(inode); } static int __check_block_validity(struct inode *inode, const char *func, unsigned int line, struct ext4_map_blocks *map) { if (ext4_has_feature_journal(inode->i_sb) && (inode->i_ino == le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) return 0; if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " "(length %d)", (unsigned long) map->m_lblk, map->m_pblk, map->m_len); return -EFSCORRUPTED; } return 0; } int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len) { int ret; if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, lblk, pblk, len); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); if (ret > 0) ret = 0; return ret; } #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) #ifdef ES_AGGRESSIVE_TEST static void ext4_map_blocks_es_recheck(handle_t *handle, struct inode *inode, struct ext4_map_blocks *es_map, struct ext4_map_blocks *map, int flags) { int retval; map->m_flags = 0; /* * There is a race window that the result is not the same. * e.g. xfstests #223 when dioread_nolock enables. The reason * is that we lookup a block mapping in extent status tree with * out taking i_data_sem. So at the time the unwritten extent * could be converted. */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, 0); } else { retval = ext4_ind_map_blocks(handle, inode, map, 0); } up_read((&EXT4_I(inode)->i_data_sem)); /* * We don't check m_len because extent will be collpased in status * tree. So the m_len might not equal. */ if (es_map->m_lblk != map->m_lblk || es_map->m_flags != map->m_flags || es_map->m_pblk != map->m_pblk) { printk("ES cache assertion failed for inode: %lu " "es_cached ex [%d/%d/%llu/%x] != " "found ex [%d/%d/%llu/%x] retval %d flags %x\n", inode->i_ino, es_map->m_lblk, es_map->m_len, es_map->m_pblk, es_map->m_flags, map->m_lblk, map->m_len, map->m_pblk, map->m_flags, retval, flags); } } #endif /* ES_AGGRESSIVE_TEST */ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map) { unsigned int status; int retval; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) retval = ext4_ext_map_blocks(handle, inode, map, 0); else retval = ext4_ind_map_blocks(handle, inode, map, 0); if (retval <= 0) return retval; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " "%lu: retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status, 0); return retval; } static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct extent_status es; unsigned int status; int err, retval = 0; /* * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE * indicates that the blocks and quotas has already been * checked when the data was copied into the page cache. */ if (map->m_flags & EXT4_MAP_DELAYED) flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; /* * Here we clear m_flags because after allocating an new extent, * it will be set again. */ map->m_flags &= ~EXT4_MAP_FLAGS; /* * We need to check for EXT4 here because migrate could have * changed the inode type in between. */ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags); } else { retval = ext4_ind_map_blocks(handle, inode, map, flags); /* * We allocated new blocks which will result in i_data's * format changing. Force the migrate to fail by clearing * migrate flags. */ if (retval > 0 && map->m_flags & EXT4_MAP_NEW) ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); } if (retval <= 0) return retval; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode %lu: " "retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } /* * We have to zeroout blocks before inserting them into extent * status tree. Otherwise someone could look them up there and * use them before they are really zeroed. We also have to * unmap metadata before zeroing as otherwise writeback can * overwrite zeros with stale data from block device. */ if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { err = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (err) return err; } /* * If the extent has been zeroed out, we don't need to update * extent status tree. */ if (flags & EXT4_GET_BLOCKS_PRE_IO && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es)) return retval; } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status, flags); return retval; } /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * * Otherwise it takes the write lock of the i_data_sem and allocate blocks * and store the allocated blocks in the result buffer head and mark it * mapped. * * If file type is extents based, it will call ext4_ext_map_blocks(), * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * * On success, it returns the number of blocks being mapped or allocated. * If flags doesn't contain EXT4_GET_BLOCKS_CREATE the blocks are * pre-allocated and unwritten, the resulting @map is marked as unwritten. * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in * that case, @map is returned as unmapped but we still do fill map->m_len to * indicate the length of a hole starting at map->m_lblk. * * It returns the error in case of allocation failure. */ int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct extent_status es; int retval; int ret = 0; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif map->m_flags = 0; ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n", flags, map->m_len, (unsigned long) map->m_lblk); /* * ext4_map_blocks returns an int, and m_len is an unsigned int */ if (unlikely(map->m_len > INT_MAX)) map->m_len = INT_MAX; /* We can handle the block number less than EXT_MAX_BLOCKS */ if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) return -EFSCORRUPTED; /* Lookup extent status tree firstly */ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; map->m_flags |= ext4_es_is_written(&es) ? EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { map->m_pblk = 0; map->m_flags |= ext4_es_is_delayed(&es) ? EXT4_MAP_DELAYED : 0; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; retval = 0; } else { BUG(); } if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) return retval; #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(handle, inode, map, &orig_map, flags); #endif goto found; } /* * In the query cache no-wait mode, nothing we can do more if we * cannot find extent in the cache. */ if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) return 0; /* * Try to see if we can get the block without requesting a new * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); retval = ext4_map_query_blocks(handle, inode, map); up_read((&EXT4_I(inode)->i_data_sem)); found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; } /* If it is only a block(s) look up */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) return retval; /* * Returns if the blocks have already allocated * * Note that if blocks have been preallocated * ext4_ext_map_blocks() returns with buffer head unmapped */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) /* * If we need to convert extent to unwritten * we continue and do the actual work in * ext4_ext_map_blocks() */ if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take * the write lock of i_data_sem, and call get_block() * with create == 1 flag. */ down_write(&EXT4_I(inode)->i_data_sem); retval = ext4_map_create_blocks(handle, inode, map, flags); up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; /* * Inodes with freshly allocated blocks where contents will be * visible after transaction commit must be on transaction's * ordered data list. */ if (map->m_flags & EXT4_MAP_NEW && !(map->m_flags & EXT4_MAP_UNWRITTEN) && !(flags & EXT4_GET_BLOCKS_ZERO) && !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { loff_t start_byte = (loff_t)map->m_lblk << inode->i_blkbits; loff_t length = (loff_t)map->m_len << inode->i_blkbits; if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) ret = ext4_jbd2_inode_add_wait(handle, inode, start_byte, length); else ret = ext4_jbd2_inode_add_write(handle, inode, start_byte, length); if (ret) return ret; } } if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN || map->m_flags & EXT4_MAP_MAPPED)) ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk + map->m_len - 1); if (retval < 0) ext_debug(inode, "failed with err %d\n", retval); return retval; } /* * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages * we have to be careful as someone else may be manipulating b_state as well. */ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) { unsigned long old_state; unsigned long new_state; flags &= EXT4_MAP_FLAGS; /* Dummy buffer_head? Set non-atomically. */ if (!bh->b_page) { bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; return; } /* * Someone else may be modifying b_state. Be careful! This is ugly but * once we get rid of using bh as a container for mapping information * to pass to / from get_block functions, this can go away. */ old_state = READ_ONCE(bh->b_state); do { new_state = (old_state & ~EXT4_MAP_FLAGS) | flags; } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state))); } static int _ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int flags) { struct ext4_map_blocks map; int ret = 0; if (ext4_has_inline_data(inode)) return -ERANGE; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map, flags); if (ret > 0) { map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } else if (ret == 0) { /* hole case, need to fill in bh->b_size */ bh->b_size = inode->i_sb->s_blocksize * map.m_len; } return ret; } int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { return _ext4_get_block(inode, iblock, bh, create ? EXT4_GET_BLOCKS_CREATE : 0); } /* * Get block function used when preparing for buffered write if we require * creating an unwritten extent if blocks haven't been allocated. The extent * will be converted to written after the IO is complete. */ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret = 0; ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", inode->i_ino, create); ret = _ext4_get_block(inode, iblock, bh_result, EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); /* * If the buffer is marked unwritten, mark it as new to make sure it is * zeroed out correctly in case of partial writes. Otherwise, there is * a chance of stale data getting exposed. */ if (ret == 0 && buffer_unwritten(bh_result)) set_buffer_new(bh_result); return ret; } /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 /* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct ext4_map_blocks map; struct buffer_head *bh; int create = map_flags & EXT4_GET_BLOCKS_CREATE; bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT; int err; ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || handle != NULL || create == 0); ASSERT(create == 0 || !nowait); map.m_lblk = block; map.m_len = 1; err = ext4_map_blocks(handle, inode, &map, map_flags); if (err == 0) return create ? ERR_PTR(-ENOSPC) : NULL; if (err < 0) return ERR_PTR(err); if (nowait) return sb_find_get_block(inode->i_sb, map.m_pblk); bh = sb_getblk(inode->i_sb, map.m_pblk); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { ASSERT(create != 0); ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || (handle != NULL)); /* * Now that we do not always journal data, we should * keep in mind whether this should always journal the * new buffer as metadata. For now, regular file * writes use ext4_get_block instead, so it's not a * problem. */ lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (unlikely(err)) { unlock_buffer(bh); goto errout; } if (!buffer_uptodate(bh)) { memset(bh->b_data, 0, inode->i_sb->s_blocksize); set_buffer_uptodate(bh); } unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); if (unlikely(err)) goto errout; } else BUFFER_TRACE(bh, "not a new buffer"); return bh; errout: brelse(bh); return ERR_PTR(err); } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct buffer_head *bh; int ret; bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; if (!bh || ext4_buffer_uptodate(bh)) return bh; ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true); if (ret) { put_bh(bh); return ERR_PTR(ret); } return bh; } /* Read a contiguous batch of blocks. */ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, bool wait, struct buffer_head **bhs) { int i, err; for (i = 0; i < bh_count; i++) { bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */); if (IS_ERR(bhs[i])) { err = PTR_ERR(bhs[i]); bh_count = i; goto out_brelse; } } for (i = 0; i < bh_count; i++) /* Note that NULL bhs[i] is valid because of holes. */ if (bhs[i] && !ext4_buffer_uptodate(bhs[i])) ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false); if (!wait) return 0; for (i = 0; i < bh_count; i++) if (bhs[i]) wait_on_buffer(bhs[i]); for (i = 0; i < bh_count; i++) { if (bhs[i] && !buffer_uptodate(bhs[i])) { err = -EIO; goto out_brelse; } } return 0; out_brelse: for (i = 0; i < bh_count; i++) { brelse(bhs[i]); bhs[i] = NULL; } return err; } int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, int (*fn)(handle_t *handle, struct inode *inode, struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; unsigned blocksize = head->b_size; int err, ret = 0; struct buffer_head *next; for (bh = head, block_start = 0; ret == 0 && (bh != head || !block_start); block_start = block_end, bh = next) { next = bh->b_this_page; block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (partial && !buffer_uptodate(bh)) *partial = 1; continue; } err = (*fn)(handle, inode, bh); if (!ret) ret = err; } return ret; } /* * Helper for handling dirtying of journalled data. We also mark the folio as * dirty so that writeback code knows about this page (and inode) contains * dirty data. ext4_writepages() then commits appropriate transaction to * make data stable. */ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) { folio_mark_dirty(bh->b_folio); return ext4_handle_dirty_metadata(handle, NULL, bh); } int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; BUFFER_TRACE(bh, "get write access"); return ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); } int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; struct inode *inode = folio->mapping->host; unsigned block_start, block_end; sector_t block; int err = 0; unsigned blocksize = inode->i_sb->s_blocksize; unsigned bbits; struct buffer_head *bh, *head, *wait[2]; int nr_wait = 0; int i; bool should_journal_data = ext4_should_journal_data(inode); BUG_ON(!folio_test_locked(folio)); BUG_ON(from > PAGE_SIZE); BUG_ON(to > PAGE_SIZE); BUG_ON(from > to); head = folio_buffers(folio); if (!head) head = create_empty_buffers(folio, blocksize, 0); bbits = ilog2(blocksize); block = (sector_t)folio->index << (PAGE_SHIFT - bbits); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); } continue; } if (buffer_new(bh)) clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) break; if (buffer_new(bh)) { /* * We may be zeroing partial buffers or all new * buffers in case of failure. Prepare JBD2 for * that. */ if (should_journal_data) do_journal_get_write_access(handle, inode, bh); if (folio_test_uptodate(folio)) { /* * Unlike __block_write_begin() we leave * dirtying of new uptodate buffers to * ->write_end() time or * folio_zero_new_buffers(). */ set_buffer_uptodate(bh); continue; } if (block_end > to || block_start < from) folio_zero_segments(folio, to, block_end, block_start, from); continue; } } if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); continue; } if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { ext4_read_bh_lock(bh, 0, false); wait[nr_wait++] = bh; } } /* * If we issued read requests, let them complete. */ for (i = 0; i < nr_wait; i++) { wait_on_buffer(wait[i]); if (!buffer_uptodate(wait[i])) err = -EIO; } if (unlikely(err)) { if (should_journal_data) ext4_journalled_zero_new_buffers(handle, inode, folio, from, to); else folio_zero_new_buffers(folio, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; err2 = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(wait[i])); if (err2) { clear_buffer_uptodate(wait[i]); err = err2; } } } return err; } /* * To preserve ordering, it is essential that the hole instantiation and * the data write be encapsulated in a single transaction. We cannot * close off a transaction and start a new one between the ext4_get_block() * and the ext4_write_end(). So doing the jbd2_journal_start at the start of * ext4_write_begin() is the right place. */ static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; handle_t *handle; int retries = 0; struct folio *folio; pgoff_t index; unsigned from, to; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; trace_ext4_write_begin(inode, pos, len); /* * Reserve one block more for addition to orphan list in case * we allocate blocks but write fails for some reason */ needed_blocks = ext4_writepage_trans_blocks(inode) + 1; index = pos >> PAGE_SHIFT; from = pos & (PAGE_SIZE - 1); to = from + len; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, foliop); if (ret < 0) return ret; if (ret == 1) return 0; } /* * __filemap_get_folio() can take a long time if the * system is thrashing due to memory pressure, or if the folio * is being written back. So grab it first before we start * the transaction handle. This also allows us to allocate * the folio (if needed) without using GFP_NOFS. */ retry_grab: folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); /* * The same as page allocation, we prealloc buffer heads before * starting the handle. */ if (!folio_buffers(folio)) create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); folio_unlock(folio); retry_journal: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { folio_put(folio); return PTR_ERR(handle); } folio_lock(folio); if (folio->mapping != mapping) { /* The folio got truncated from under us */ folio_unlock(folio); folio_put(folio); ext4_journal_stop(handle); goto retry_grab; } /* In case writeback began while the folio was unlocked */ folio_wait_stable(folio); if (ext4_should_dioread_nolock(inode)) ret = ext4_block_write_begin(handle, folio, pos, len, ext4_get_block_unwritten); else ret = ext4_block_write_begin(handle, folio, pos, len, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, NULL, do_journal_get_write_access); } if (ret) { bool extended = (pos + len > inode->i_size) && !ext4_verity_in_progress(inode); folio_unlock(folio); /* * ext4_block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_rwsem. * * Add inode to orphan list in case we crash before * truncate finishes */ if (extended && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ext4_journal_stop(handle); if (extended) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the * orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; folio_put(folio); return ret; } *foliop = folio; return ret; } /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh) { int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); ret = ext4_dirty_journalled_data(handle, bh); clear_buffer_meta(bh); clear_buffer_prio(bh); return ret; } /* * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * * ext4 never places buffers on inode->i_mapping->i_private_list. metadata * buffers are managed internally. */ static int ext4_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); if (ext4_has_inline_data(inode) && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. * * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree * blocks are being written past EOF, so skip the i_size update. */ if (!verity) i_size_changed = ext4_update_inode_size(inode, pos + copied); folio_unlock(folio); folio_put(folio); if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock * ordering of folio lock and transaction start for journaling * filesystems. */ if (i_size_changed) ret = ext4_mark_inode_dirty(handle, inode); if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * This is a private version of folio_zero_new_buffers() which doesn't * set the buffer to be dirty, since in data=journalled mode we need * to call ext4_dirty_journalled_data() instead. */ static void ext4_journalled_zero_new_buffers(handle_t *handle, struct inode *inode, struct folio *folio, unsigned from, unsigned to) { unsigned int block_start = 0, block_end; struct buffer_head *head, *bh; bh = head = folio_buffers(folio); do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { if (!folio_test_uptodate(folio)) { unsigned start, size; start = max(from, block_start); size = min(to, block_end) - start; folio_zero_range(folio, start, size); } clear_buffer_new(bh); write_end_fn(handle, inode, bh); } } block_start = block_end; bh = bh->b_this_page; } while (bh != head); } static int ext4_journalled_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; int ret = 0, ret2; int partial = 0; unsigned from, to; int size_changed = 0; bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_SIZE - 1); to = from + len; BUG_ON(!ext4_handle_valid(handle)); if (ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); if (unlikely(copied < len) && !folio_test_uptodate(folio)) { copied = 0; ext4_journalled_zero_new_buffers(handle, inode, folio, from, to); } else { if (unlikely(copied < len)) ext4_journalled_zero_new_buffers(handle, inode, folio, from + copied, to); ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, from + copied, &partial, write_end_fn); if (!partial) folio_mark_uptodate(folio); } if (!verity) size_changed = ext4_update_inode_size(inode, pos + copied); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; folio_unlock(folio); folio_put(folio); if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; } if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * Reserve space for 'nr_resv' clusters */ static int ext4_da_reserve_space(struct inode *inode, int nr_resv) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); int ret; /* * We will charge metadata quota at writeout time; this saves * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv)); if (ret) return ret; spin_lock(&ei->i_block_reservation_lock); if (ext4_claim_free_clusters(sbi, nr_resv, 0)) { spin_unlock(&ei->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv)); return -ENOSPC; } ei->i_reserved_data_blocks += nr_resv; trace_ext4_da_reserve_space(inode, nr_resv); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ } void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); if (!to_free) return; /* Nothing to release, exit */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); trace_ext4_da_release_space(inode, to_free); if (unlikely(to_free > ei->i_reserved_data_blocks)) { /* * if there aren't enough reserved blocks, then the * counter is messed up somewhere. Since this * function is called from invalidate page, it's * harmless to return without any action. */ ext4_warning(inode->i_sb, "ext4_da_release_space: " "ino %lu, to_free %d with only %d reserved " "data blocks", inode->i_ino, to_free, ei->i_reserved_data_blocks); WARN_ON(1); to_free = ei->i_reserved_data_blocks; } ei->i_reserved_data_blocks -= to_free; /* update fs dirty data blocks counter */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); } /* * Delayed allocation stuff */ struct mpage_da_data { /* These are input fields for ext4_do_writepages() */ struct inode *inode; struct writeback_control *wbc; unsigned int can_map:1; /* Can writepages call map blocks? */ /* These are internal state of ext4_do_writepages() */ pgoff_t first_page; /* The first page to write */ pgoff_t next_page; /* Current page to examine */ pgoff_t last_page; /* Last page to examine */ /* * Extent to map - this can be after first_page because that can be * fully mapped. We somewhat abuse m_flags to store whether the extent * is delalloc or unwritten. */ struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; unsigned int scanned_until_end:1; unsigned int journalled_more_data:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, bool invalidate) { unsigned nr, i; pgoff_t index, end; struct folio_batch fbatch; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; /* This is necessary when next_page == 0. */ if (mpd->first_page >= mpd->next_page) return; mpd->scanned_until_end = 0; index = mpd->first_page; end = mpd->next_page - 1; if (invalidate) { ext4_lblk_t start, last; start = index << (PAGE_SHIFT - inode->i_blkbits); last = end << (PAGE_SHIFT - inode->i_blkbits); /* * avoid racing with extent status tree scans made by * ext4_insert_delayed_block() */ down_write(&EXT4_I(inode)->i_data_sem); ext4_es_remove_extent(inode, start, last - start + 1); up_write(&EXT4_I(inode)->i_data_sem); } folio_batch_init(&fbatch); while (index <= end) { nr = filemap_get_folios(mapping, &index, end, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; if (folio->index < mpd->first_page) continue; if (folio_next_index(folio) - 1 > end) continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); if (invalidate) { if (folio_mapped(folio)) folio_clear_dirty_for_io(folio); block_invalidate_folio(folio, 0, folio_size(folio)); folio_clear_uptodate(folio); } folio_unlock(folio); } folio_batch_release(&fbatch); } } static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", EXT4_C2B(EXT4_SB(inode->i_sb), ext4_count_free_clusters(sb))); ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_dirtyclusters_counter))); ext4_msg(sb, KERN_CRIT, "Block reservation details"); ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", ei->i_reserved_data_blocks); return; } /* * Check whether the cluster containing lblk has been allocated or has * delalloc reservation. * * Returns 0 if the cluster doesn't have either, 1 if it has delalloc * reservation, 2 if it's already been allocated, negative error code on * failure. */ static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; /* Has delalloc reservation? */ if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk)) return 1; /* Already been allocated? */ if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) return 2; ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); if (ret < 0) return ret; if (ret > 0) return 2; return 0; } /* * ext4_insert_delayed_blocks - adds a multiple delayed blocks to the extents * status tree, incrementing the reserved * cluster/block count or making pending * reservations where needed * * @inode - file containing the newly added block * @lblk - start logical block to be added * @len - length of blocks to be added * * Returns 0 on success, negative error code on failure. */ static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; bool lclu_allocated = false; bool end_allocated = false; ext4_lblk_t resv_clu; ext4_lblk_t end = lblk + len - 1; /* * If the cluster containing lblk or end is shared with a delayed, * written, or unwritten extent in a bigalloc file system, it's * already been accounted for and does not need to be reserved. * A pending reservation must be made for the cluster if it's * shared with a written or unwritten extent and doesn't already * have one. Written and unwritten extents can be purged from the * extents status tree if the system is under memory pressure, so * it's necessary to examine the extent tree if a search of the * extents status tree doesn't get a match. */ if (sbi->s_cluster_ratio == 1) { ret = ext4_da_reserve_space(inode, len); if (ret != 0) /* ENOSPC */ return ret; } else { /* bigalloc */ resv_clu = EXT4_B2C(sbi, end) - EXT4_B2C(sbi, lblk) + 1; ret = ext4_clu_alloc_state(inode, lblk); if (ret < 0) return ret; if (ret > 0) { resv_clu--; lclu_allocated = (ret == 2); } if (EXT4_B2C(sbi, lblk) != EXT4_B2C(sbi, end)) { ret = ext4_clu_alloc_state(inode, end); if (ret < 0) return ret; if (ret > 0) { resv_clu--; end_allocated = (ret == 2); } } if (resv_clu) { ret = ext4_da_reserve_space(inode, resv_clu); if (ret != 0) /* ENOSPC */ return ret; } } ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated, end_allocated); return 0; } /* * Looks up the requested blocks and sets the delalloc extent map. * First try to look up for the extent entry that contains the requested * blocks in the extent status tree without i_data_sem, then try to look * up for the ondisk extent mapping with i_data_sem in read mode, * finally hold i_data_sem in write mode, looks up again and add a * delalloc extent entry if it still couldn't find any extent. Pass out * the mapped extent through @map and return 0 on success. */ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) { struct extent_status es; int retval; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif map->m_flags = 0; ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { map->m_len = min_t(unsigned int, map->m_len, es.es_len - (map->m_lblk - es.es_lblk)); if (ext4_es_is_hole(&es)) goto add_delayed; found: /* * Delayed extent could be allocated by fallocate. * So we need to check it. */ if (ext4_es_is_delayed(&es)) { map->m_flags |= EXT4_MAP_DELAYED; return 0; } map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; if (ext4_es_is_written(&es)) map->m_flags |= EXT4_MAP_MAPPED; else if (ext4_es_is_unwritten(&es)) map->m_flags |= EXT4_MAP_UNWRITTEN; else BUG(); #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); #endif return 0; } /* * Try to see if we can get the block without requesting a new * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) retval = 0; else retval = ext4_map_query_blocks(NULL, inode, map); up_read(&EXT4_I(inode)->i_data_sem); if (retval) return retval < 0 ? retval : 0; add_delayed: down_write(&EXT4_I(inode)->i_data_sem); /* * Page fault path (ext4_page_mkwrite does not take i_rwsem) * and fallocate path (no folio lock) can race. Make sure we * lookup the extent status tree here again while i_data_sem * is held in write mode, before inserting a new da entry in * the extent status tree. */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { map->m_len = min_t(unsigned int, map->m_len, es.es_len - (map->m_lblk - es.es_lblk)); if (!ext4_es_is_hole(&es)) { up_write(&EXT4_I(inode)->i_data_sem); goto found; } } else if (!ext4_has_inline_data(inode)) { retval = ext4_map_query_blocks(NULL, inode, map); if (retval) { up_write(&EXT4_I(inode)->i_data_sem); return retval < 0 ? retval : 0; } } map->m_flags |= EXT4_MAP_DELAYED; retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len); up_write(&EXT4_I(inode)->i_data_sem); return retval; } /* * This is a special get_block_t callback which is used by * ext4_da_write_begin(). It will either return mapped block or * reserve space for a single block. * * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. * We also have b_blocknr = -1 and b_bdev initialized properly * * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev * initialized properly. */ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { struct ext4_map_blocks map; sector_t invalid_block = ~((sector_t) 0xffff); int ret = 0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) invalid_block = ~0; map.m_lblk = iblock; map.m_len = 1; /* * first, we need to know whether the block is allocated already * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ ret = ext4_da_map_blocks(inode, &map); if (ret < 0) return ret; if (map.m_flags & EXT4_MAP_DELAYED) { map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); set_buffer_delay(bh); return 0; } map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); if (buffer_unwritten(bh)) { /* A delayed write to unwritten bh should be marked * new and mapped. Mapped ensures that we don't do * get_block multiple times when we write to the same * offset and new ensures that we do proper zero out * for partial write. */ set_buffer_new(bh); set_buffer_mapped(bh); } return 0; } static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) { mpd->first_page += folio_nr_pages(folio); folio_unlock(folio); } static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) { size_t len; loff_t size; int err; BUG_ON(folio->index != mpd->first_page); folio_clear_dirty_for_io(folio); /* * We have to be very careful here! Nothing protects writeback path * against i_size changes and the page can be writeably mapped into * page tables. So an application can be growing i_size and writing * data through mmap while writeback runs. folio_clear_dirty_for_io() * write-protects our page in page tables and the page cannot get * written to again until we release folio lock. So only after * folio_clear_dirty_for_io() we are safe to sample i_size for * ext4_bio_write_folio() to zero-out tail of the written page. We rely * on the barrier provided by folio_test_clear_dirty() in * folio_clear_dirty_for_io() to make sure i_size is really sampled only * after page tables are updated. */ size = i_size_read(mpd->inode); len = folio_size(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) mpd->wbc->nr_to_write--; return err; } #define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay)) /* * mballoc gives us at most this number of blocks... * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). * The rest of mballoc seems to handle chunks up to full group size. */ #define MAX_WRITEPAGES_EXTENT_LEN 2048 /* * mpage_add_bh_to_extent - try to add bh to extent of blocks to map * * @mpd - extent of blocks * @lblk - logical number of the block in the file * @bh - buffer head we want to add to the extent * * The function is used to collect contig. blocks in the same state. If the * buffer doesn't require mapping for writeback and we haven't started the * extent of buffers to map yet, the function returns 'true' immediately - the * caller can write the buffer right away. Otherwise the function returns true * if the block has been added to the extent, false if the block couldn't be * added. */ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, struct buffer_head *bh) { struct ext4_map_blocks *map = &mpd->map; /* Buffer that doesn't need mapping for writeback? */ if (!buffer_dirty(bh) || !buffer_mapped(bh) || (!buffer_delay(bh) && !buffer_unwritten(bh))) { /* So far no extent to map => we write the buffer right away */ if (map->m_len == 0) return true; return false; } /* First block in the extent? */ if (map->m_len == 0) { /* We cannot map unless handle is started... */ if (!mpd->do_map) return false; map->m_lblk = lblk; map->m_len = 1; map->m_flags = bh->b_state & BH_FLAGS; return true; } /* Don't go larger than mballoc is willing to allocate */ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) return false; /* Can we merge the block to our big extent? */ if (lblk == map->m_lblk + map->m_len && (bh->b_state & BH_FLAGS) == map->m_flags) { map->m_len++; return true; } return false; } /* * mpage_process_page_bufs - submit page buffers for IO or add them to extent * * @mpd - extent of blocks for mapping * @head - the first buffer in the page * @bh - buffer we should start processing from * @lblk - logical number of the block in the file corresponding to @bh * * Walk through page buffers from @bh upto @head (exclusive) and either submit * the page for IO if all buffers in this page were mapped and there's no * accumulated extent of buffers to map or add buffers in the page to the * extent of buffers to map. The function returns 1 if the caller can continue * by processing the next page, 0 if it should stop adding buffers to the * extent to map because we cannot extend it anymore. It can also return value * < 0 in case of error during IO submission. */ static int mpage_process_page_bufs(struct mpage_da_data *mpd, struct buffer_head *head, struct buffer_head *bh, ext4_lblk_t lblk) { struct inode *inode = mpd->inode; int err; ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) >> inode->i_blkbits; if (ext4_verity_in_progress(inode)) blocks = EXT_MAX_BLOCKS; do { BUG_ON(buffer_locked(bh)); if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { /* Found extent to map? */ if (mpd->map.m_len) return 0; /* Buffer needs mapping and handle is not started? */ if (!mpd->do_map) return 0; /* Everything mapped so far and we hit EOF */ break; } } while (lblk++, (bh = bh->b_this_page) != head); /* So far everything mapped? Submit the page for IO. */ if (mpd->map.m_len == 0) { err = mpage_submit_folio(mpd, head->b_folio); if (err < 0) return err; mpage_folio_done(mpd, head->b_folio); } if (lblk >= blocks) { mpd->scanned_until_end = 1; return 0; } return 1; } /* * mpage_process_folio - update folio buffers corresponding to changed extent * and may submit fully mapped page for IO * @mpd: description of extent to map, on return next extent to map * @folio: Contains these buffers. * @m_lblk: logical block mapping. * @m_pblk: corresponding physical mapping. * @map_bh: determines on return whether this page requires any further * mapping or not. * * Scan given folio buffers corresponding to changed extent and update buffer * state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits. * If the given folio is not fully mapped, we update @mpd to the next extent in * the given folio that needs mapping & return @map_bh as true. */ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, bool *map_bh) { struct buffer_head *head, *bh; ext4_io_end_t *io_end = mpd->io_submit.io_end; ext4_lblk_t lblk = *m_lblk; ext4_fsblk_t pblock = *m_pblk; int err = 0; int blkbits = mpd->inode->i_blkbits; ssize_t io_end_size = 0; struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); bh = head = folio_buffers(folio); do { if (lblk < mpd->map.m_lblk) continue; if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { /* * Buffer after end of mapped extent. * Find next buffer in the folio to map. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; io_end_vec->size += io_end_size; err = mpage_process_page_bufs(mpd, head, bh, lblk); if (err > 0) err = 0; if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) { io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) { err = PTR_ERR(io_end_vec); goto out; } io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits; } *map_bh = true; goto out; } if (buffer_delay(bh)) { clear_buffer_delay(bh); bh->b_blocknr = pblock++; } clear_buffer_unwritten(bh); io_end_size += (1 << blkbits); } while (lblk++, (bh = bh->b_this_page) != head); io_end_vec->size += io_end_size; *map_bh = false; out: *m_lblk = lblk; *m_pblk = pblock; return err; } /* * mpage_map_buffers - update buffers corresponding to changed extent and * submit fully mapped pages for IO * * @mpd - description of extent to map, on return next extent to map * * Scan buffers corresponding to changed extent (we expect corresponding pages * to be already locked) and update buffer state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits, * and mark buffers as uninit when we perform writes to unwritten extents * and do extent conversion after IO is finished. If the last page is not fully * mapped, we update @map to the next extent in the last page that needs * mapping. Otherwise we submit the page for IO. */ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) { struct folio_batch fbatch; unsigned nr, i; struct inode *inode = mpd->inode; int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; ext4_fsblk_t pblock; int err; bool map_bh = false; start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; lblk = start << bpp_bits; pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); while (start <= end) { nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* * If map_bh is true, means page may require further bh * mapping, or maybe the page was submitted for IO. * So we return to call further extent mapping. */ if (err < 0 || map_bh) goto out; /* Page fully mapped - let IO run! */ err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; mpage_folio_done(mpd, folio); } folio_batch_release(&fbatch); } /* Extent fully mapped and matches with page boundary. We are done. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; return 0; out: folio_batch_release(&fbatch); return err; } static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) { struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int get_blocks_flags; int err, dioread_nolock; trace_ext4_da_write_pages_extent(inode, map); /* * Call ext4_map_blocks() to allocate any delayed allocation blocks, or * to convert an unwritten extent to be initialized (in the case * where we have written into one or more preallocated blocks). It is * possible that we're going to need more metadata blocks than * previously reserved. However we must not fail because we're in * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if * possible. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_GET_BLOCKS_IO_SUBMIT; dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; err = ext4_map_blocks(handle, inode, map, get_blocks_flags); if (err < 0) return err; if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { if (!mpd->io_submit.io_end->handle && ext4_handle_valid(handle)) { mpd->io_submit.io_end->handle = handle->h_rsv_handle; handle->h_rsv_handle = NULL; } ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); } BUG_ON(map->m_len == 0); return 0; } /* * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length * mpd->len and submit pages underlying it for IO * * @handle - handle for journal operations * @mpd - extent to map * @give_up_on_write - we set this to true iff there is a fatal error and there * is no hope of writing the data. The caller should discard * dirty pages to avoid infinite loops. * * The function maps extent starting at mpd->lblk of length mpd->len. If it is * delayed, blocks are allocated, if it is unwritten, we may need to convert * them to initialized or split the described range from larger unwritten * extent. Note that we need not map all the described range since allocation * can return less blocks or the range is covered by more unwritten extents. We * cannot map more because we are limited by reserved transaction credits. On * the other hand we always make sure that the last touched page is fully * mapped so that it can be written out (and thus forward progress is * guaranteed). After mapping we submit all mapped pages for IO. */ static int mpage_map_and_submit_extent(handle_t *handle, struct mpage_da_data *mpd, bool *give_up_on_write) { struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int err; loff_t disksize; int progress = 0; ext4_io_end_t *io_end = mpd->io_submit.io_end; struct ext4_io_end_vec *io_end_vec; io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) return PTR_ERR(io_end_vec); io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { struct super_block *sb = inode->i_sb; if (ext4_forced_shutdown(sb)) goto invalidate_dirty_pages; /* * Let the uper layers retry transient errors. * In the case of ENOSPC, if ext4_count_free_blocks() * is non-zero, a commit should free up blocks. */ if ((err == -ENOMEM) || (err == -ENOSPC && ext4_count_free_clusters(sb))) { if (progress) goto update_disksize; return err; } ext4_msg(sb, KERN_CRIT, "Delayed block allocation failed for " "inode %lu at logical offset %llu with" " max blocks %u with error %d", inode->i_ino, (unsigned long long)map->m_lblk, (unsigned)map->m_len, -err); ext4_msg(sb, KERN_CRIT, "This should not happen!! Data will " "be lost\n"); if (err == -ENOSPC) ext4_print_free_blocks(inode); invalidate_dirty_pages: *give_up_on_write = true; return err; } progress = 1; /* * Update buffer state, submit mapped pages, and get us new * extent to map */ err = mpage_map_and_submit_buffers(mpd); if (err < 0) goto update_disksize; } while (map->m_len); update_disksize: /* * Update on-disk size after IO is submitted. Races with * truncate are avoided by checking i_size under i_data_sem. */ disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT; if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { int err2; loff_t i_size; down_write(&EXT4_I(inode)->i_data_sem); i_size = i_size_read(inode); if (disksize > i_size) disksize = i_size; if (disksize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = disksize; up_write(&EXT4_I(inode)->i_data_sem); err2 = ext4_mark_inode_dirty(handle, inode); if (err2) { ext4_error_err(inode->i_sb, -err2, "Failed to mark inode %lu dirty", inode->i_ino); } if (!err) err = err2; } return err; } /* * Calculate the total number of credits to reserve for one writepages * iteration. This is called from ext4_writepages(). We map an extent of * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + * bpp - 1 blocks in bpp different extents. */ static int ext4_da_writepages_trans_blocks(struct inode *inode) { int bpp = ext4_journal_blocks_per_page(inode); return ext4_meta_trans_blocks(inode, MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio, size_t len) { struct buffer_head *page_bufs = folio_buffers(folio); struct inode *inode = folio->mapping->host; int ret, err; ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, do_journal_get_write_access); err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, write_end_fn); if (ret == 0) ret = err; err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; return ret; } static int mpage_journal_page_buffers(handle_t *handle, struct mpage_da_data *mpd, struct folio *folio) { struct inode *inode = mpd->inode; loff_t size = i_size_read(inode); size_t len = folio_size(folio); folio_clear_checked(folio); mpd->wbc->nr_to_write--; if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) len = size & (len - 1); return ext4_journal_folio_buffers(handle, folio, len); } /* * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages * needing mapping, submit mapped pages * * @mpd - where to look for pages * * Walk dirty pages in the mapping. If they are fully mapped, submit them for * IO immediately. If we cannot map blocks, we submit just already mapped * buffers in the page for IO and keep page dirty. When we can map blocks and * we find a page which isn't mapped we start accumulating extent of buffers * underlying these pages that needs mapping (formed by either delayed or * unwritten buffers). We also lock the pages containing these buffers. The * extent found is returned in @mpd structure (starting at mpd->lblk with * length mpd->len blocks). * * Note that this function can attach bios to one io_end structure which are * neither logically nor physically contiguous. Although it may seem as an * unnecessary complication, it is actually inevitable in blocksize < pagesize * case as we need to track IO to all buffers underlying a page in one io_end. */ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; struct folio_batch fbatch; unsigned int nr_folios; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; xa_mark_t tag; int i, err = 0; int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; int bpp = ext4_journal_blocks_per_page(mpd->inode); if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; mpd->map.m_len = 0; mpd->next_page = index; if (ext4_should_journal_data(mpd->inode)) { handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, bpp); if (IS_ERR(handle)) return PTR_ERR(handle); } folio_batch_init(&fbatch); while (index <= end) { nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); if (nr_folios == 0) break; for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; /* * Accumulated enough dirty pages? This doesn't apply * to WB_SYNC_ALL mode. For integrity sync we have to * keep going because someone may be concurrently * dirtying pages, and we might have synced a lot of * newly appeared dirty pages, but have not synced all * of the old dirty pages. */ if (mpd->wbc->sync_mode == WB_SYNC_NONE && mpd->wbc->nr_to_write <= mpd->map.m_len >> (PAGE_SHIFT - blkbits)) goto out; /* If we can't merge this page, we are done. */ if (mpd->map.m_len > 0 && mpd->next_page != folio->index) goto out; if (handle) { err = ext4_journal_ensure_credits(handle, bpp, 0); if (err < 0) goto out; } folio_lock(folio); /* * If the page is no longer dirty, or its mapping no * longer corresponds to inode we are writing (which * means it has been truncated or invalidated), or the * page is already under writeback and we are not doing * a data integrity writeback, skip the page */ if (!folio_test_dirty(folio) || (folio_test_writeback(folio) && (mpd->wbc->sync_mode == WB_SYNC_NONE)) || unlikely(folio->mapping != mapping)) { folio_unlock(folio); continue; } folio_wait_writeback(folio); BUG_ON(folio_test_writeback(folio)); /* * Should never happen but for buggy code in * other subsystems that call * set_page_dirty() without properly warning * the file system first. See [1] for more * information. * * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz */ if (!folio_buffers(folio)) { ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index); folio_clear_dirty(folio); folio_unlock(folio); continue; } if (mpd->map.m_len == 0) mpd->first_page = folio->index; mpd->next_page = folio_next_index(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we * first handle writeout of the page for checkpoint and * only after that handle delayed page dirtying. This * makes sure current data is checkpointed to the final * location before possibly journalling it again which * is desirable when the page is frequently dirtied * through a pin. */ if (!mpd->can_map) { err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; /* Pending dirtying of journalled data? */ if (folio_test_checked(folio)) { err = mpage_journal_page_buffers(handle, mpd, folio); if (err < 0) goto out; mpd->journalled_more_data = 1; } mpage_folio_done(mpd, folio); } else { /* Add all dirty buffers to mpd */ lblk = ((ext4_lblk_t)folio->index) << (PAGE_SHIFT - blkbits); head = folio_buffers(folio); err = mpage_process_page_bufs(mpd, head, head, lblk); if (err <= 0) goto out; err = 0; } } folio_batch_release(&fbatch); cond_resched(); } mpd->scanned_until_end = 1; if (handle) ext4_journal_stop(handle); return 0; out: folio_batch_release(&fbatch); if (handle) ext4_journal_stop(handle); return err; } static int ext4_do_writepages(struct mpage_da_data *mpd) { struct writeback_control *wbc = mpd->wbc; pgoff_t writeback_index = 0; long nr_to_write = wbc->nr_to_write; int range_whole = 0; int cycled = 1; handle_t *handle = NULL; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); struct blk_plug plug; bool give_up_on_write = false; trace_ext4_writepages(inode, wbc); /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() * because that could violate lock ordering on umount */ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) goto out_writepages; /* * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test * fs shutdown state instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) { ret = -EROFS; goto out_writepages; } /* * If we have inline data and arrive here, it means that * we will soon create the block for the 1st page, so * we'd better clear the inline data here. */ if (ext4_has_inline_data(inode)) { /* Just inode will be modified... */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_writepages; } BUG_ON(ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)); ext4_destroy_inline_data(handle, inode); ext4_journal_stop(handle); } /* * data=journal mode does not do delalloc so we just need to writeout / * journal already mapped buffers. On the other hand we need to commit * transaction to make data stable. We expect all the data to be * already in the journal (the only exception are DMA pinned pages * dirtied behind our back) so we commit transaction here and run the * writeback loop to checkpoint them. The checkpointing is not actually * necessary to make data persistent *but* quite a few places (extent * shifting operations, fsverity, ...) depend on being able to drop * pagecache pages after calling filemap_write_and_wait() and for that * checkpointing needs to happen. */ if (ext4_should_journal_data(inode)) { mpd->can_map = 0; if (wbc->sync_mode == WB_SYNC_ALL) ext4_fc_commit(sbi->s_journal, EXT4_I(inode)->i_datasync_tid); } mpd->journalled_more_data = 0; if (ext4_should_dioread_nolock(inode)) { /* * We may need to convert up to one extent per block in * the page and we may dirty the inode. */ rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, PAGE_SIZE >> inode->i_blkbits); } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; if (writeback_index) cycled = 0; mpd->first_page = writeback_index; mpd->last_page = -1; } else { mpd->first_page = wbc->range_start >> PAGE_SHIFT; mpd->last_page = wbc->range_end >> PAGE_SHIFT; } ext4_io_submit_init(&mpd->io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, mpd->first_page, mpd->last_page); blk_start_plug(&plug); /* * First writeback pages that don't need mapping - we can avoid * starting a transaction unnecessarily and also avoid being blocked * in the block layer on device congestion while having transaction * started. */ mpd->do_map = 0; mpd->scanned_until_end = 0; mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd->io_submit.io_end) { ret = -ENOMEM; goto unplug; } ret = mpage_prepare_extent_to_map(mpd); /* Unlock pages we didn't use */ mpage_release_unused_pages(mpd, false); /* Submit prepared bio */ ext4_io_submit(&mpd->io_submit); ext4_put_io_end_defer(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; if (ret < 0) goto unplug; while (!mpd->scanned_until_end && wbc->nr_to_write > 0) { /* For each extent of pages we use new io_end */ mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd->io_submit.io_end) { ret = -ENOMEM; break; } WARN_ON_ONCE(!mpd->can_map); /* * We have two constraints: We find one extent to map and we * must always write out whole page (makes a difference when * blocksize < pagesize) so that we don't block on IO when we * try to write out the rest of the page. Journalled mode is * not supported by delalloc. */ BUG_ON(ext4_should_journal_data(inode)); needed_blocks = ext4_da_writepages_trans_blocks(inode); /* start a new transaction */ handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); /* Release allocated io_end */ ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; break; } mpd->do_map = 1; trace_ext4_da_write_pages(inode, mpd->first_page, wbc); ret = mpage_prepare_extent_to_map(mpd); if (!ret && mpd->map.m_len) ret = mpage_map_and_submit_extent(handle, mpd, &give_up_on_write); /* * Caution: If the handle is synchronous, * ext4_journal_stop() can wait for transaction commit * to finish which may depend on writeback of pages to * complete or on page lock to be released. In that * case, we have to wait until after we have * submitted all the IO, released page locks we hold, * and dropped io_end reference (for extent conversion * to be able to complete) before stopping the handle. */ if (!ext4_handle_valid(handle) || handle->h_sync == 0) { ext4_journal_stop(handle); handle = NULL; mpd->do_map = 0; } /* Unlock pages we didn't use */ mpage_release_unused_pages(mpd, give_up_on_write); /* Submit prepared bio */ ext4_io_submit(&mpd->io_submit); /* * Drop our io_end reference we got from init. We have * to be careful and use deferred io_end finishing if * we are still holding the transaction as we can * release the last reference to io_end which may end * up doing unwritten extent conversion. */ if (handle) { ext4_put_io_end_defer(mpd->io_submit.io_end); ext4_journal_stop(handle); } else ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; if (ret == -ENOSPC && sbi->s_journal) { /* * Commit the transaction which would * free blocks released in the transaction * and try again */ jbd2_journal_force_commit_nested(sbi->s_journal); ret = 0; continue; } /* Fatal error - ENOMEM, EIO... */ if (ret) break; } unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; mpd->last_page = writeback_index - 1; mpd->first_page = 0; goto retry; } /* Update index */ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* * Set the writeback_index so that range_cyclic * mode will write it back later */ mapping->writeback_index = mpd->first_page; out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); return ret; } static int ext4_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct super_block *sb = mapping->host->i_sb; struct mpage_da_data mpd = { .inode = mapping->host, .wbc = wbc, .can_map = 1, }; int ret; int alloc_ctx; if (unlikely(ext4_forced_shutdown(sb))) return -EIO; alloc_ctx = ext4_writepages_down_read(sb); ret = ext4_do_writepages(&mpd); /* * For data=journal writeback we could have come across pages marked * for delayed dirtying (PageChecked) which were just added to the * running transaction. Try once more to get them to stable storage. */ if (!ret && mpd.journalled_more_data) ret = ext4_do_writepages(&mpd); ext4_writepages_up_read(sb, alloc_ctx); return ret; } int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; struct mpage_da_data mpd = { .inode = jinode->i_vfs_inode, .wbc = &wbc, .can_map = 0, }; return ext4_do_writepages(&mpd); } static int ext4_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; int alloc_ctx; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); ret = dax_writeback_mapping_range(mapping, EXT4_SB(inode->i_sb)->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); ext4_writepages_up_read(inode->i_sb, alloc_ctx); return ret; } static int ext4_nonda_switch(struct super_block *sb) { s64 free_clusters, dirty_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * switch to non delalloc mode if we are running low * on free block. The free block accounting via percpu * counters can get slightly wrong with percpu_counter_batch getting * accumulated on each CPU without updating global counters * Delalloc need an accurate free block accounting. So switch * to non delalloc when we are near to error range. */ free_clusters = percpu_counter_read_positive(&sbi->s_freeclusters_counter); dirty_clusters = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); /* * Start pushing delalloc when 1/2 of free blocks are dirty. */ if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); if (2 * free_clusters < 3 * dirty_clusters || free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { /* * free block count is less than 150% of dirty blocks * or free blocks is less than watermark */ return 1; } return 0; } static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { int ret, retries = 0; struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; index = pos >> PAGE_SHIFT; if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, len, foliop, fsdata); } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len, foliop, fsdata); if (ret < 0) return ret; if (ret == 1) return 0; } retry: folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); ret = ext4_block_write_begin(NULL, folio, pos, len, ext4_da_get_block_prep); if (ret < 0) { folio_unlock(folio); folio_put(folio); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold inode lock. */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return ret; } *foliop = folio; return ret; } /* * Check if we should update i_disksize * when write to the end of file but not require block allocation */ static int ext4_da_should_update_i_disksize(struct folio *folio, unsigned long offset) { struct buffer_head *bh; struct inode *inode = folio->mapping->host; unsigned int idx; int i; bh = folio_buffers(folio); idx = offset >> inode->i_blkbits; for (i = 0; i < idx; i++) bh = bh->b_this_page; if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) return 0; return 1; } static int ext4_da_do_write_end(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio) { struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool disksize_changed = false; loff_t new_i_size; if (unlikely(!folio_buffers(folio))) { folio_unlock(folio); folio_put(folio); return -EIO; } /* * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. */ copied = block_write_end(NULL, mapping, pos, len, copied, folio, NULL); new_i_size = pos + copied; /* * It's important to update i_size while still holding folio lock, * because folio writeout could otherwise come in and zero beyond * i_size. * * Since we are holding inode lock, we are sure i_disksize <= * i_size. We also know that if i_disksize < i_size, there are * delalloc writes pending in the range up to i_size. If the end of * the current write is <= i_size, there's no need to touch * i_disksize since writeback will push i_disksize up to i_size * eventually. If the end of the current write is > i_size and * inside an allocated block which ext4_da_should_update_i_disksize() * checked, we need to update i_disksize here as certain * ext4_writepages() paths not allocating blocks and update i_disksize. */ if (new_i_size > inode->i_size) { unsigned long end; i_size_write(inode, new_i_size); end = (new_i_size - 1) & (PAGE_SIZE - 1); if (copied && ext4_da_should_update_i_disksize(folio, end)) { ext4_update_i_disksize(inode, new_i_size); disksize_changed = true; } } folio_unlock(folio); folio_put(folio); if (old_size < pos) pagecache_isize_extended(inode, old_size, pos); if (disksize_changed) { handle_t *handle; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); } return copied; } static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = mapping->host; int write_mode = (int)(unsigned long)fsdata; if (write_mode == FALL_BACK_TO_NONDELALLOC) return ext4_write_end(file, mapping, pos, len, copied, folio, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); if (write_mode != CONVERT_INLINE_DATA && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); if (unlikely(copied < len) && !folio_test_uptodate(folio)) copied = 0; return ext4_da_do_write_end(mapping, pos, len, copied, folio); } /* * Force all delayed allocation blocks to be allocated for a given inode. */ int ext4_alloc_da_blocks(struct inode *inode) { trace_ext4_alloc_da_blocks(inode); if (!EXT4_I(inode)->i_reserved_data_blocks) return 0; /* * We do something simple for now. The filemap_flush() will * also start triggering a write of the data blocks, which is * not strictly speaking necessary (and for users of * laptop_mode, not even desirable). However, to do otherwise * would require replicating code paths in: * * ext4_writepages() -> * write_cache_pages() ---> (via passed in callback function) * __mpage_da_writepage() --> * mpage_add_bh_to_extent() * mpage_da_map_blocks() * * The problem is that write_cache_pages(), located in * mm/page-writeback.c, marks pages clean in preparation for * doing I/O, which is not desirable if we're not planning on * doing I/O at all. * * We could call write_cache_pages(), and then redirty all of * the pages by calling redirty_page_for_writepage() but that * would be ugly in the extreme. So instead we would need to * replicate parts of the code in the above functions, * simplifying them because we wouldn't actually intend to * write out the pages, but rather only collect contiguous * logical block extents, call the multi-block allocator, and * then update the buffer heads with the block allocations. * * For now, though, we'll cheat by calling filemap_flush(), * which will map the blocks, and start the I/O, but not * actually wait for the I/O to complete. */ return filemap_flush(inode->i_mapping); } /* * bmap() is special. It gets used by applications such as lilo and by * the swapper to find the on-disk block of a specific piece of data. * * Naturally, this is dangerous if the block concerned is still in the * journal. If somebody makes a swapfile on an ext4 data-journaling * filesystem and enables swap, then they may get a nasty shock when the * data getting swapped to that swapfile suddenly gets overwritten by * the original zero's written out previously to the journal and * awaiting writeback in the kernel's buffer cache. * * So, if we see any bmap calls here on a modified, data-journaled file, * take extra steps to flush any blocks which might be in the cache. */ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; sector_t ret = 0; inode_lock_shared(inode); /* * We can get here for an inline file via the FIBMAP ioctl */ if (ext4_has_inline_data(inode)) goto out; if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && (test_opt(inode->i_sb, DELALLOC) || ext4_should_journal_data(inode))) { /* * With delalloc or journalled data we want to sync the file so * that we can make sure we allocate blocks for file and data * is in place for the user to see it */ filemap_write_and_wait(mapping); } ret = iomap_bmap(mapping, block, &ext4_iomap_ops); out: inode_unlock_shared(inode); return ret; } static int ext4_read_folio(struct file *file, struct folio *folio) { int ret = -EAGAIN; struct inode *inode = folio->mapping->host; trace_ext4_read_folio(inode, folio); if (ext4_has_inline_data(inode)) ret = ext4_readpage_inline(inode, folio); if (ret == -EAGAIN) return ext4_mpage_readpages(inode, NULL, folio); return ret; } static void ext4_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; /* If the file has inline data, no need to do readahead. */ if (ext4_has_inline_data(inode)) return; ext4_mpage_readpages(inode, rac, NULL); } static void ext4_invalidate_folio(struct folio *folio, size_t offset, size_t length) { trace_ext4_invalidate_folio(folio, offset, length); /* No journalling happens on data buffers when this function is used */ WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio))); block_invalidate_folio(folio, offset, length); } static int __ext4_journalled_invalidate_folio(struct folio *folio, size_t offset, size_t length) { journal_t *journal = EXT4_JOURNAL(folio->mapping->host); trace_ext4_journalled_invalidate_folio(folio, offset, length); /* * If it's a full truncate we just forget about the pending dirtying */ if (offset == 0 && length == folio_size(folio)) folio_clear_checked(folio); return jbd2_journal_invalidate_folio(journal, folio, offset, length); } /* Wrapper for aops... */ static void ext4_journalled_invalidate_folio(struct folio *folio, size_t offset, size_t length) { WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0); } static bool ext4_release_folio(struct folio *folio, gfp_t wait) { struct inode *inode = folio->mapping->host; journal_t *journal = EXT4_JOURNAL(inode); trace_ext4_release_folio(inode, folio); /* Page has dirty journalled data -> cannot release */ if (folio_test_checked(folio)) return false; if (journal) return jbd2_journal_try_to_free_buffers(journal, folio); else return try_to_free_buffers(folio); } static bool ext4_inode_datasync_dirty(struct inode *inode) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; if (journal) { if (jbd2_transaction_committed(journal, EXT4_I(inode)->i_datasync_tid)) return false; if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) return !list_empty(&EXT4_I(inode)->i_fc_list); return true; } /* Any metadata buffers to write? */ if (!list_empty(&inode->i_mapping->i_private_list)) return true; return inode->i_state & I_DIRTY_DATASYNC; } static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, struct ext4_map_blocks *map, loff_t offset, loff_t length, unsigned int flags) { u8 blkbits = inode->i_blkbits; /* * Writes that span EOF might trigger an I/O size update on completion, * so consider them to be dirty for the purpose of O_DSYNC, even if * there is no other metadata changes being made or are pending. */ iomap->flags = 0; if (ext4_inode_datasync_dirty(inode) || offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; if (map->m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; if (flags & IOMAP_DAX) iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; else iomap->bdev = inode->i_sb->s_bdev; iomap->offset = (u64) map->m_lblk << blkbits; iomap->length = (u64) map->m_len << blkbits; if ((map->m_flags & EXT4_MAP_MAPPED) && !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) iomap->flags |= IOMAP_F_MERGED; /* * Flags passed to ext4_map_blocks() for direct I/O writes can result * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits * set. In order for any allocated unwritten extents to be converted * into written extents correctly within the ->end_io() handler, we * need to ensure that the iomap->type is set appropriately. Hence, the * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has * been set first. */ if (map->m_flags & EXT4_MAP_UNWRITTEN) { iomap->type = IOMAP_UNWRITTEN; iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else if (map->m_flags & EXT4_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else if (map->m_flags & EXT4_MAP_DELAYED) { iomap->type = IOMAP_DELALLOC; iomap->addr = IOMAP_NULL_ADDR; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; } } static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; u8 blkbits = inode->i_blkbits; int ret, dio_credits, m_flags = 0, retries = 0; /* * Trim the mapping request to the maximum value that we can map at * once for direct I/O. */ if (map->m_len > DIO_MAX_BLOCKS) map->m_len = DIO_MAX_BLOCKS; dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); retry: /* * Either we allocate blocks and then don't get an unwritten extent, so * in that case we have reserved enough credits. Or, the blocks are * already allocated and unwritten. In that case, the extent conversion * fits into the credits as well. */ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); if (IS_ERR(handle)) return PTR_ERR(handle); /* * DAX and direct I/O are the only two operations that are currently * supported with IOMAP_WRITE. */ WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT))); if (flags & IOMAP_DAX) m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; /* * We use i_size instead of i_disksize here because delalloc writeback * can complete at any point during the I/O and subsequently push the * i_disksize out to i_size. This could be beyond where direct I/O is * happening and thus expose allocated blocks to direct I/O reads. */ else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode)) m_flags = EXT4_GET_BLOCKS_CREATE; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; ret = ext4_map_blocks(handle, inode, map, m_flags); /* * We cannot fill holes in indirect tree based inodes as that could * expose stale data in the case of a crash. Use the magic error code * to fallback to buffered I/O. */ if (!m_flags && !ret) ret = -ENOTBLK; ext4_journal_stop(handle); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return ret; } static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; if (WARN_ON_ONCE(ext4_has_inline_data(inode))) return -ERANGE; /* * Calculate the first and last logical blocks respectively. */ map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; if (flags & IOMAP_WRITE) { /* * We check here if the blocks are already allocated, then we * don't need to start a journal txn and we can directly return * the mapping information. This could boost performance * especially in multi-threaded overwrite requests. */ if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED)) goto out; } ret = ext4_iomap_alloc(inode, &map, flags); } else { ret = ext4_map_blocks(NULL, inode, &map, 0); } if (ret < 0) return ret; out: /* * When inline encryption is enabled, sometimes I/O to an encrypted file * has to be broken up to guarantee DUN contiguity. Handle this by * limiting the length of the mapping returned. */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; } static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int ret; /* * Even for writes we don't need to allocate blocks, so just pretend * we are reading to save overhead of starting a transaction. */ flags &= ~IOMAP_WRITE; ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED); return ret; } static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { /* * Check to see whether an error occurred while writing out the data to * the allocated blocks. If so, return the magic error code so that we * fallback to buffered I/O and attempt to complete the remainder of * the I/O. Any blocks that may have been allocated in preparation for * the direct I/O will be reused during buffered I/O. */ if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) return -ENOTBLK; return 0; } const struct iomap_ops ext4_iomap_ops = { .iomap_begin = ext4_iomap_begin, .iomap_end = ext4_iomap_end, }; const struct iomap_ops ext4_iomap_overwrite_ops = { .iomap_begin = ext4_iomap_overwrite_begin, .iomap_end = ext4_iomap_end, }; static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; if (ext4_has_inline_data(inode)) { ret = ext4_inline_data_iomap(inode, iomap); if (ret != -EAGAIN) { if (ret == 0 && offset >= iomap->length) ret = -ENOENT; return ret; } } /* * Calculate the first and last logical block respectively. */ map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; /* * Fiemap callers may call for offset beyond s_bitmap_maxbytes. * So handle it here itself instead of querying ext4_map_blocks(). * Since ext4_map_blocks() will warn about it and will return * -EIO error. */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (offset >= sbi->s_bitmap_maxbytes) { map.m_flags = 0; goto set_iomap; } } ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; set_iomap: ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; } const struct iomap_ops ext4_iomap_report_ops = { .iomap_begin = ext4_iomap_begin_report, }; /* * For data=journal mode, folio should be marked dirty only when it was * writeably mapped. When that happens, it was already attached to the * transaction and marked as jbddirty (we take care of this in * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings * so we should have nothing to do here, except for the case when someone * had the page pinned and dirtied the page through this pin (e.g. by doing * direct IO to it). In that case we'd need to attach buffers here to the * transaction but we cannot due to lock ordering. We cannot just dirty the * folio and leave attached buffers clean, because the buffers' dirty state is * "definitive". We cannot just set the buffers dirty or jbddirty because all * the journalling code will explode. So what we do is to mark the folio * "pending dirty" and next time ext4_writepages() is called, attach buffers * to the transaction appropriately. */ static bool ext4_journalled_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_buffers(folio)); if (folio_maybe_dma_pinned(folio)) folio_set_checked(folio); return filemap_dirty_folio(mapping, folio); } static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio)); WARN_ON_ONCE(!folio_buffers(folio)); return block_dirty_folio(mapping, folio); } static int ext4_iomap_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { return iomap_swapfile_activate(sis, file, span, &ext4_iomap_report_ops); } static const struct address_space_operations ext4_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, .dirty_folio = ext4_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_journalled_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, .dirty_folio = ext4_journalled_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_da_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, .dirty_folio = ext4_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, .swap_activate = ext4_iomap_swap_activate, }; void ext4_set_aops(struct inode *inode) { switch (ext4_inode_journal_mode(inode)) { case EXT4_INODE_ORDERED_DATA_MODE: case EXT4_INODE_WRITEBACK_DATA_MODE: break; case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; return; default: BUG(); } if (IS_DAX(inode)) inode->i_mapping->a_ops = &ext4_dax_aops; else if (test_opt(inode->i_sb, DELALLOC)) inode->i_mapping->a_ops = &ext4_da_aops; else inode->i_mapping->a_ops = &ext4_aops; } /* * Here we can't skip an unwritten buffer even though it usually reads zero * because it might have data in pagecache (eg, if called from ext4_zero_range, * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a * racing writeback can come later and flush the stale pagecache to disk. */ static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_SHIFT; unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; struct folio *folio; int err = 0; folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping_gfp_constraint(mapping, ~__GFP_FS)); if (IS_ERR(folio)) return PTR_ERR(folio); blocksize = inode->i_sb->s_blocksize; iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; iblock++; pos += blocksize; } if (buffer_freed(bh)) { BUFFER_TRACE(bh, "freed: skip"); goto unlock; } if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "unmapped"); ext4_get_block(inode, iblock, bh, 0); /* unmapped? It's a hole - nothing to do */ if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "still unmapped"); goto unlock; } } /* Ok, it's mapped. Make sure it's up-to-date */ if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { err = ext4_read_bh_lock(bh, 0, true); if (err) goto unlock; if (fscrypt_inode_uses_fs_layer_crypto(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); err = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(bh)); if (err) { clear_buffer_uptodate(bh); goto unlock; } } } if (ext4_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto unlock; } folio_zero_range(folio, offset, length); BUFFER_TRACE(bh, "zeroed end of block"); if (ext4_should_journal_data(inode)) { err = ext4_dirty_journalled_data(handle, bh); } else { err = 0; mark_buffer_dirty(bh); if (ext4_should_order_data(inode)) err = ext4_jbd2_inode_add_write(handle, inode, from, length); } unlock: folio_unlock(folio); folio_put(folio); return err; } /* * ext4_block_zero_page_range() zeros out a mapping of length 'length' * starting from file offset 'from'. The range to be zero'd must * be contained with in one block. If the specified range exceeds * the end of the block it will be shortened to end of the block * that corresponds to 'from' */ static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { struct inode *inode = mapping->host; unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize = inode->i_sb->s_blocksize; unsigned max = blocksize - (offset & (blocksize - 1)); /* * correct length if it does not fall between * 'from' and the end of the block */ if (length > max || length < 0) length = max; if (IS_DAX(inode)) { return dax_zero_range(inode, from, length, NULL, &ext4_iomap_ops); } return __ext4_block_zero_page_range(handle, mapping, from, length); } /* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ static int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { unsigned offset = from & (PAGE_SIZE-1); unsigned length; unsigned blocksize; struct inode *inode = mapping->host; /* If we are processing an encrypted inode during orphan list handling */ if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) return 0; blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); return ext4_block_zero_page_range(handle, mapping, from, length); } int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t length) { struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; unsigned partial_start, partial_end; ext4_fsblk_t start, end; loff_t byte_end = (lstart + length - 1); int err = 0; partial_start = lstart & (sb->s_blocksize - 1); partial_end = byte_end & (sb->s_blocksize - 1); start = lstart >> sb->s_blocksize_bits; end = byte_end >> sb->s_blocksize_bits; /* Handle partial zero within the single block */ if (start == end && (partial_start || (partial_end != sb->s_blocksize - 1))) { err = ext4_block_zero_page_range(handle, mapping, lstart, length); return err; } /* Handle partial zero out on the start of the range */ if (partial_start) { err = ext4_block_zero_page_range(handle, mapping, lstart, sb->s_blocksize); if (err) return err; } /* Handle partial zero out on the end of the range */ if (partial_end != sb->s_blocksize - 1) err = ext4_block_zero_page_range(handle, mapping, byte_end - partial_end, partial_end + 1); return err; } int ext4_can_truncate(struct inode *inode) { if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) return 1; if (S_ISLNK(inode->i_mode)) return !ext4_inode_is_fast_symlink(inode); return 0; } /* * We have to make sure i_disksize gets properly updated before we truncate * page cache due to hole punching or zero range. Otherwise i_disksize update * can get lost as it may have been postponed to submission of writeback but * that will never happen after we truncate page cache. */ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t len) { handle_t *handle; int ret; loff_t size = i_size_read(inode); WARN_ON(!inode_is_locked(inode)); if (offset > size || offset + len < size) return 0; if (EXT4_I(inode)->i_disksize >= size) return 0; handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_update_i_disksize(inode, size); ret = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); return ret; } static void ext4_wait_dax_page(struct inode *inode) { filemap_invalidate_unlock(inode->i_mapping); schedule(); filemap_invalidate_lock(inode->i_mapping); } int ext4_break_layouts(struct inode *inode) { struct page *page; int error; if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock))) return -EINVAL; do { page = dax_layout_busy_page(inode->i_mapping); if (!page) return 0; error = ___wait_var_event(&page->_refcount, atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, 0, 0, ext4_wait_dax_page(inode)); } while (error == 0); return error; } /* * ext4_punch_hole: punches a hole in a file by releasing the blocks * associated with the given offset and length * * @inode: File inode * @offset: The offset where the hole will begin * @len: The length of the hole * * Returns: 0 on success or negative on failure */ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; loff_t first_block_offset, last_block_offset, max_length; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); handle_t *handle; unsigned int credits; int ret = 0, ret2 = 0; trace_ext4_punch_hole(inode, offset, length, 0); /* * Write out all dirty pages to avoid race conditions * Then release them. */ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { ret = filemap_write_and_wait_range(mapping, offset, offset + length - 1); if (ret) return ret; } inode_lock(inode); /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) goto out_mutex; /* * If the hole extends beyond i_size, set the hole * to end after the page that contains i_size */ if (offset + length > inode->i_size) { length = inode->i_size + PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) - offset; } /* * For punch hole the length + offset needs to be within one block * before last range. Adjust the length if it goes beyond that limit. */ max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; if (offset + length > max_length) length = max_length - offset; if (offset & (sb->s_blocksize - 1) || (offset + length) & (sb->s_blocksize - 1)) { /* * Attach jinode to inode for jbd2 if we do any zeroing of * partial block */ ret = ext4_inode_attach_jinode(inode); if (ret < 0) goto out_mutex; } /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); ret = file_modified(file); if (ret) goto out_mutex; /* * Prevent page faults from reinstantiating pages we have released from * page cache. */ filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) goto out_dio; first_block_offset = round_up(offset, sb->s_blocksize); last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; /* Now release the pages and zero block aligned part of pages*/ if (last_block_offset > first_block_offset) { ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) goto out_dio; truncate_pagecache_range(inode, first_block_offset, last_block_offset); } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(sb, ret); goto out_dio; } ret = ext4_zero_partial_blocks(handle, inode, offset, length); if (ret) goto out_stop; first_block = (offset + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); /* If there are blocks to remove, do it */ if (stop_block > first_block) { ext4_lblk_t hole_len = stop_block - first_block; down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, first_block, hole_len); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_remove_space(inode, first_block, stop_block - 1); else ret = ext4_ind_remove_space(handle, inode, first_block, stop_block); ext4_es_insert_extent(inode, first_block, hole_len, ~0, EXTENT_STATUS_HOLE, 0); up_write(&EXT4_I(inode)->i_data_sem); } ext4_fc_track_range(handle, inode, first_block, stop_block); if (IS_SYNC(inode)) ext4_handle_sync(handle); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret2)) ret = ret2; if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); out_dio: filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; } int ext4_inode_attach_jinode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct jbd2_inode *jinode; if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) return 0; jinode = jbd2_alloc_inode(GFP_KERNEL); spin_lock(&inode->i_lock); if (!ei->jinode) { if (!jinode) { spin_unlock(&inode->i_lock); return -ENOMEM; } ei->jinode = jinode; jbd2_journal_init_jbd_inode(ei->jinode, inode); jinode = NULL; } spin_unlock(&inode->i_lock); if (unlikely(jinode != NULL)) jbd2_free_inode(jinode); return 0; } /* * ext4_truncate() * * We block out ext4_get_block() block instantiations across the entire * transaction, and VFS/VM ensures that ext4_truncate() cannot run * simultaneously on behalf of the same inode. * * As we work through the truncate and commit bits of it to the journal there * is one core, guiding principle: the file's tree must always be consistent on * disk. We must be able to restart the truncate after a crash. * * The file's tree may be transiently inconsistent in memory (although it * probably isn't), but whenever we close off and commit a journal transaction, * the contents of (the filesystem + the journal) must be consistent and * restartable. It's pretty simple, really: bottom up, right to left (although * left-to-right works OK too). * * Note that at recovery time, journal replay occurs *before* the restart of * truncate against the orphan inode list. * * The committed inode has the new, desired i_size (which is the same as * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see * that this inode's truncate did not complete and it will again call * ext4_truncate() to have another go. So there will be instantiated blocks * to the right of the truncation point in a crashed ext4 filesystem. But * that's fine - as long as they are linked from the inode, the post-crash * ext4_truncate() run will find them and release them. */ int ext4_truncate(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int credits; int err = 0, err2; handle_t *handle; struct address_space *mapping = inode->i_mapping; /* * There is a possibility that we're either freeing the inode * or it's a completely new inode. In those cases we might not * have i_rwsem locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) goto out_trace; if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); if (ext4_has_inline_data(inode)) { int has_inline = 1; err = ext4_inline_data_truncate(inode, &has_inline); if (err || has_inline) goto out_trace; } /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { err = ext4_inode_attach_jinode(inode); if (err) goto out_trace; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out_trace; } if (inode->i_size & (inode->i_sb->s_blocksize - 1)) ext4_block_truncate_page(handle, mapping, inode->i_size); /* * We add the inode to the orphan list, so that if this * truncate spans multiple transactions, and we crash, we will * resume the truncate when the filesystem recovers. It also * marks the inode dirty, to catch the new size. * * Implication: the file must always be in a sane, consistent * truncatable state while each transaction commits. */ err = ext4_orphan_add(handle, inode); if (err) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) err = ext4_ext_truncate(handle, inode); else ext4_ind_truncate(handle, inode); up_write(&ei->i_data_sem); if (err) goto out_stop; if (IS_SYNC(inode)) ext4_handle_sync(handle); out_stop: /* * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. * However, if this was a real unlink then we were called by * ext4_evict_inode(), and we allow that function to clean up the * orphan info for us. */ if (inode->i_nlink) ext4_orphan_del(handle, inode); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; ext4_journal_stop(handle); out_trace: trace_ext4_truncate_exit(inode); return err; } static inline u64 ext4_inode_peek_iversion(const struct inode *inode) { if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return inode_peek_iversion_raw(inode); else return inode_peek_iversion(inode); } static int ext4_inode_blocks_set(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { struct inode *inode = &(ei->vfs_inode); u64 i_blocks = READ_ONCE(inode->i_blocks); struct super_block *sb = inode->i_sb; if (i_blocks <= ~0U) { /* * i_blocks can be represented in a 32 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = 0; ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); return 0; } /* * This should never happen since sb->s_maxbytes should not have * allowed this, sb->s_maxbytes was set according to the huge_file * feature in ext4_fill_super(). */ if (!ext4_has_feature_huge_file(sb)) return -EFSCORRUPTED; if (i_blocks <= 0xffffffffffffULL) { /* * i_blocks can be represented in a 48 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); } else { ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); /* i_block is stored in file system block size */ i_blocks = i_blocks >> (inode->i_blkbits - 9); raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); } return 0; } static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode) { struct ext4_inode_info *ei = EXT4_I(inode); uid_t i_uid; gid_t i_gid; projid_t i_projid; int block; int err; err = ext4_inode_blocks_set(raw_inode, ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); i_projid = from_kprojid(&init_user_ns, ei->i_projid); if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); /* * Fix up interoperability with old kernels. Otherwise, * old inodes get re-used with the upper 16 bits of the * uid/gid intact. */ if (ei->i_dtime && list_empty(&ei->i_orphan)) { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } else { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = cpu_to_le16(high_16_bits(i_gid)); } } else { raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_MTIME(inode, raw_inode); EXT4_INODE_SET_ATIME(inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); ext4_isize_set(raw_inode, ei->i_disksize); raw_inode->i_generation = cpu_to_le32(inode->i_generation); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { raw_inode->i_block[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); raw_inode->i_block[1] = 0; } else { raw_inode->i_block[0] = 0; raw_inode->i_block[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); raw_inode->i_block[2] = 0; } } else if (!ext4_has_inline_data(inode)) { for (block = 0; block < EXT4_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; } if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = ext4_inode_peek_iversion(inode); raw_inode->i_disk_version = cpu_to_le32(ivers); if (ei->i_extra_isize) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) raw_inode->i_version_hi = cpu_to_le32(ivers >> 32); raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); } } if (i_projid != EXT4_DEF_PROJID && !ext4_has_feature_project(inode->i_sb)) err = err ?: -EFSCORRUPTED; if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) raw_inode->i_projid = cpu_to_le32(i_projid); ext4_inode_csum_set(inode, raw_inode, ei); return err; } /* * ext4_get_inode_loc returns with an extra refcount against the inode's * underlying buffer_head on success. If we pass 'inode' and it does not * have in-inode xattr, we have all inode data in memory that is needed * to recreate the on-disk version of this inode. */ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, struct inode *inode, struct ext4_iloc *iloc, ext4_fsblk_t *ret_block) { struct ext4_group_desc *gdp; struct buffer_head *bh; ext4_fsblk_t block; struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; if (ino < EXT4_ROOT_INO || ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) return -EFSCORRUPTED; iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); if (!gdp) return -EIO; /* * Figure out the offset within the block group inode table */ inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; inode_offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)); iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); block = ext4_inode_table(sb, gdp); if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) || (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) { ext4_error(sb, "Invalid inode table block %llu in " "block_group %u", block, iloc->block_group); return -EFSCORRUPTED; } block += (inode_offset / inodes_per_block); bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; if (ext4_buffer_uptodate(bh)) goto has_buffer; lock_buffer(bh); if (ext4_buffer_uptodate(bh)) { /* Someone brought it uptodate while we waited */ unlock_buffer(bh); goto has_buffer; } /* * If we have all information of the inode in memory and this * is the only valid inode in the block, we need not read the * block. */ if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct buffer_head *bitmap_bh; int i, start; start = inode_offset & ~(inodes_per_block - 1); /* Is the inode bitmap in cache? */ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); if (unlikely(!bitmap_bh)) goto make_io; /* * If the inode bitmap isn't in cache then the * optimisation may end up performing two reads instead * of one, so skip it. */ if (!buffer_uptodate(bitmap_bh)) { brelse(bitmap_bh); goto make_io; } for (i = start; i < start + inodes_per_block; i++) { if (i == inode_offset) continue; if (ext4_test_bit(i, bitmap_bh->b_data)) break; } brelse(bitmap_bh); if (i == start + inodes_per_block) { struct ext4_inode *raw_inode = (struct ext4_inode *) (bh->b_data + iloc->offset); /* all other inodes are free, so skip I/O */ memset(bh->b_data, 0, bh->b_size); if (!ext4_test_inode_state(inode, EXT4_STATE_NEW)) ext4_fill_raw_inode(inode, raw_inode); set_buffer_uptodate(bh); unlock_buffer(bh); goto has_buffer; } } make_io: /* * If we need to do any I/O, try to pre-readahead extra * blocks from the inode table. */ blk_start_plug(&plug); if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; table = ext4_inode_table(sb, gdp); /* s_inode_readahead_blks is always a power of 2 */ b = block & ~((ext4_fsblk_t) ra_blks - 1); if (table > b) b = table; end = b + ra_blks; num = EXT4_INODES_PER_GROUP(sb); if (ext4_has_group_desc_csum(sb)) num -= ext4_itable_unused_count(sb, gdp); table += num / inodes_per_block; if (end > table) end = table; while (b <= end) ext4_sb_breadahead_unmovable(sb, b++); } /* * There are other valid inodes in the buffer, this inode * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. */ trace_ext4_load_inode(sb, ino); ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); blk_finish_plug(&plug); wait_on_buffer(bh); ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); if (!buffer_uptodate(bh)) { if (ret_block) *ret_block = block; brelse(bh); return -EIO; } has_buffer: iloc->bh = bh; return 0; } static int __ext4_get_inode_loc_noinmem(struct inode *inode, struct ext4_iloc *iloc) { ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc, &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, "unable to read itable block"); return ret; } int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) { ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc, &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, "unable to read itable block"); return ret; } int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc) { return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL); } static bool ext4_should_enable_dax(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (test_opt2(inode->i_sb, DAX_NEVER)) return false; if (!S_ISREG(inode->i_mode)) return false; if (ext4_should_journal_data(inode)) return false; if (ext4_has_inline_data(inode)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) return false; if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) return false; if (test_opt(inode->i_sb, DAX_ALWAYS)) return true; return ext4_test_inode_flag(inode, EXT4_INODE_DAX); } void ext4_set_inode_flags(struct inode *inode, bool init) { unsigned int flags = EXT4_I(inode)->i_flags; unsigned int new_fl = 0; WARN_ON_ONCE(IS_DAX(inode) && init); if (flags & EXT4_SYNC_FL) new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) new_fl |= S_APPEND; if (flags & EXT4_IMMUTABLE_FL) new_fl |= S_IMMUTABLE; if (flags & EXT4_NOATIME_FL) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; /* Because of the way inode_set_flags() works we must preserve S_DAX * here if already set. */ new_fl |= (inode->i_flags & S_DAX); if (init && ext4_should_enable_dax(inode)) new_fl |= S_DAX; if (flags & EXT4_ENCRYPT_FL) new_fl |= S_ENCRYPTED; if (flags & EXT4_CASEFOLD_FL) new_fl |= S_CASEFOLD; if (flags & EXT4_VERITY_FL) new_fl |= S_VERITY; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| S_ENCRYPTED|S_CASEFOLD|S_VERITY); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { blkcnt_t i_blocks ; struct inode *inode = &(ei->vfs_inode); struct super_block *sb = inode->i_sb; if (ext4_has_feature_huge_file(sb)) { /* we are using combined 48 bit field */ i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | le32_to_cpu(raw_inode->i_blocks_lo); if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { /* i_blocks represent file system block size */ return i_blocks << (inode->i_blkbits - 9); } else { return i_blocks; } } else { return le32_to_cpu(raw_inode->i_blocks_lo); } } static inline int ext4_iget_extra_inode(struct inode *inode, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (EXT4_INODE_HAS_XATTR_SPACE(inode) && *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { int err; ext4_set_inode_state(inode, EXT4_STATE_XATTR); err = ext4_find_inline_data_nolock(inode); if (!err && ext4_has_inline_data(inode)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return err; } else EXT4_I(inode)->i_inline_off = 0; return 0; } int ext4_get_projid(struct inode *inode, kprojid_t *projid) { if (!ext4_has_feature_project(inode->i_sb)) return -EOPNOTSUPP; *projid = EXT4_I(inode)->i_projid; return 0; } /* * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag * set. */ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) { if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) inode_set_iversion_raw(inode, val); else inode_set_iversion_queried(inode, val); } static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags) { if (flags & EXT4_IGET_EA_INODE) { if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return "missing EA_INODE flag"; if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || EXT4_I(inode)->i_file_acl) return "ea_inode with extended attributes"; } else { if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return "unexpected EA_INODE flag"; } if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) return "unexpected bad inode w/o EXT4_IGET_BAD"; return NULL; } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) { struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; const char *err_str; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; int block; uid_t i_uid; gid_t i_gid; projid_t i_projid; if ((!(flags & EXT4_IGET_SPECIAL) && ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || ino == le32_to_cpu(es->s_usr_quota_inum) || ino == le32_to_cpu(es->s_grp_quota_inum) || ino == le32_to_cpu(es->s_prj_quota_inum) || ino == le32_to_cpu(es->s_orphan_file_inum))) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) return ERR_PTR(-ESTALE); __ext4_error(sb, function, line, false, EFSCORRUPTED, 0, "inode #%lu: comm %s: iget: illegal inode #", ino, current->comm); return ERR_PTR(-EFSCORRUPTED); } inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { if ((err_str = check_igot_inode(inode, flags)) != NULL) { ext4_error_inode(inode, function, line, 0, err_str); iput(inode); return ERR_PTR(-EFSCORRUPTED); } return inode; } ei = EXT4_I(inode); iloc.bh = NULL; ret = __ext4_get_inode_loc_noinmem(inode, &iloc); if (ret < 0) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); if ((flags & EXT4_IGET_HANDLE) && (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { ret = -ESTALE; goto bad_inode; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb) || (ei->i_extra_isize & 3)) { ext4_error_inode(inode, function, line, 0, "iget: bad extra_isize %u " "(inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; goto bad_inode; } } else ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ if (ext4_has_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); } if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) && (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) { ext4_error_inode_err(inode, function, line, 0, EFSBADCRC, "iget: checksum invalid"); ret = -EFSBADCRC; goto bad_inode; } inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if (ext4_has_feature_project(sb) && EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); else i_projid = EXT4_DEF_PROJID; if (!(test_opt(inode->i_sb, NO_UID32))) { i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. * This is needed because nfsd might try to access dead inodes * the test is that same one that e2fsck uses * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && ino != EXT4_BOOT_LOADER_INO) { /* this inode is deleted or unallocated */ if (flags & EXT4_IGET_SPECIAL) { ext4_error_inode(inode, function, line, 0, "iget: special inode unallocated"); ret = -EFSCORRUPTED; } else ret = -ESTALE; goto bad_inode; } /* The only unlinked inodes we let through here have * valid i_mode and are being read by the orphan * recovery code: that's fine, we're about to complete * the process of deleting those. * OR it is the EXT4_BOOT_LOADER_INO which is * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); ext4_set_inode_flags(inode, true); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); if ((size = i_size_read(inode)) < 0) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; goto bad_inode; } /* * If dir_index is not enabled but there's dir with INDEX flag set, * we'd normally treat htree data as empty space. But with metadata * checksumming that corrupts checksums so forbid that. */ if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { ext4_error_inode(inode, function, line, 0, "iget: Dir with htree data on filesystem without dir_index feature."); ret = -EFSCORRUPTED; goto bad_inode; } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; ei->i_last_alloc_group = ~0; /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers! */ for (block = 0; block < EXT4_N_BLOCKS; block++) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); ext4_fc_init_inode(&ei->vfs_inode); /* * Set transaction id's of transactions that have to be committed * to finish f[data]sync. We set them to currently running transaction * as we cannot be sure that the inode or some of its metadata isn't * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ if (journal) { transaction_t *transaction; tid_t tid; read_lock(&journal->j_state_lock); if (journal->j_running_transaction) transaction = journal->j_running_transaction; else transaction = journal->j_committing_transaction; if (transaction) tid = transaction->t_tid; else tid = journal->j_commit_sequence; read_unlock(&journal->j_state_lock); ei->i_sync_tid = tid; ei->i_datasync_tid = tid; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { ret = ext4_iget_extra_inode(inode, raw_inode, ei); if (ret) goto bad_inode; } } EXT4_INODE_GET_CTIME(inode, raw_inode); EXT4_INODE_GET_ATIME(inode, raw_inode); EXT4_INODE_GET_MTIME(inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = le32_to_cpu(raw_inode->i_disk_version); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) ivers |= (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } ext4_inode_set_iversion_queried(inode, ivers); } ret = 0; if (ei->i_file_acl && !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) { ext4_error_inode(inode, function, line, 0, "iget: bad extended attribute block %llu", ei->i_file_acl); ret = -EFSCORRUPTED; goto bad_inode; } else if (!ext4_has_inline_data(inode)) { /* validate the block references in the inode */ if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) && (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode)))) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_check_inode(inode); else ret = ext4_ind_check_inode(inode); } } if (ret) goto bad_inode; if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { /* VFS does not allow setting these so must be corruption */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { ext4_error_inode(inode, function, line, 0, "iget: immutable or append flags " "not allowed on symlinks"); ret = -EFSCORRUPTED; goto bad_inode; } if (IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { inode->i_link = (char *)ei->i_data; inode->i_op = &ext4_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); } else { inode->i_op = &ext4_symlink_inode_operations; } } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &ext4_special_inode_operations; if (raw_inode->i_block[0]) init_special_inode(inode, inode->i_mode, old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else if (ino == EXT4_BOOT_LOADER_INO) { make_bad_inode(inode); } else { ret = -EFSCORRUPTED; ext4_error_inode(inode, function, line, 0, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) { ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); ret = -EFSCORRUPTED; goto bad_inode; } if ((err_str = check_igot_inode(inode, flags)) != NULL) { ext4_error_inode(inode, function, line, 0, err_str); ret = -EFSCORRUPTED; goto bad_inode; } brelse(iloc.bh); unlock_new_inode(inode); return inode; bad_inode: brelse(iloc.bh); iget_failed(inode); return ERR_PTR(ret); } static void __ext4_update_other_inode_time(struct super_block *sb, unsigned long orig_ino, unsigned long ino, struct ext4_inode *raw_inode) { struct inode *inode; inode = find_inode_by_ino_rcu(sb, ino); if (!inode) return; if (!inode_is_dirtytime_only(inode)) return; spin_lock(&inode->i_lock); if (inode_is_dirtytime_only(inode)) { struct ext4_inode_info *ei = EXT4_I(inode); inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_MTIME(inode, raw_inode); EXT4_INODE_SET_ATIME(inode, raw_inode); ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); trace_ext4_other_inode_update_time(inode, orig_ino); return; } spin_unlock(&inode->i_lock); } /* * Opportunistically update the other time fields for other inodes in * the same inode table block. */ static void ext4_update_other_inodes_time(struct super_block *sb, unsigned long orig_ino, char *buf) { unsigned long ino; int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; int inode_size = EXT4_INODE_SIZE(sb); /* * Calculate the first inode in the inode table block. Inode * numbers are one-based. That is, the first inode in a block * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). */ ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1; rcu_read_lock(); for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { if (ino == orig_ino) continue; __ext4_update_other_inode_time(sb, orig_ino, ino, (struct ext4_inode *)buf); } rcu_read_unlock(); } /* * Post the struct inode info into an on-disk inode location in the * buffer-cache. This gobbles the caller's reference to the * buffer_head in the inode location struct. * * The caller must have write access to iloc->bh. */ static int ext4_do_update_inode(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; struct super_block *sb = inode->i_sb; int err; int need_datasync = 0, set_large_file = 0; spin_lock(&ei->i_raw_lock); /* * For fields not tracked in the in-memory inode, initialise them * to zero for new inodes. */ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) need_datasync = 1; if (ei->i_disksize > 0x7fffffffULL) { if (!ext4_has_feature_large_file(sb) || EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV)) set_large_file = 1; } err = ext4_fill_raw_inode(inode, raw_inode); spin_unlock(&ei->i_raw_lock); if (err) { EXT4_ERROR_INODE(inode, "corrupted inode contents"); goto out_brelse; } if (inode->i_sb->s_flags & SB_LAZYTIME) ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) goto out_error; ext4_clear_inode_state(inode, EXT4_STATE_NEW); if (set_large_file) { BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE); if (err) goto out_error; lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_large_file(sb); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_handle_sync(handle); err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } ext4_update_inode_fsync_trans(handle, inode, need_datasync); out_error: ext4_std_error(inode->i_sb, err); out_brelse: brelse(bh); return err; } /* * ext4_write_inode() * * We are called from a few places: * * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * * - Within flush work (sys_sync(), kupdate and such). * We wait on commit, if told to. * * - Within iput_final() -> write_inode_now() * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in * which we are interested. * * It would be a bug for them to not do this. The code: * * mark_inode_dirty(inode) * stuff(); * inode->i_size = expr; * * is in error because write_inode() could occur while `stuff()' is running, * and the new i_size will be lost. Plus the inode will no longer be on the * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { ext4_debug("called recursively, non-PF_MEMALLOC!\n"); dump_stack(); return -EIO; } /* * No need to force transaction in WB_SYNC_NONE mode. Also * ext4_sync_fs() will force the commit after everything is * written. */ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid); } else { struct ext4_iloc iloc; err = __ext4_get_inode_loc_noinmem(inode, &iloc); if (err) return err; /* * sync(2) will flush the whole buffer cache. No need to do * it here separately for each inode. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO, "IO error syncing inode"); err = -EIO; } brelse(iloc.bh); } return err; } /* * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate * buffers that are attached to a folio straddling i_size and are undergoing * commit. In that case we have to wait for commit to finish and try again. */ static void ext4_wait_for_tail_page_commit(struct inode *inode) { unsigned offset; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid; int ret; bool has_transaction; offset = inode->i_size & (PAGE_SIZE - 1); /* * If the folio is fully truncated, we don't need to wait for any commit * (and we even should not as __ext4_journalled_invalidate_folio() may * strip all buffers from the folio but keep the folio dirty which can then * confuse e.g. concurrent ext4_writepages() seeing dirty folio without * buffers). Also we don't need to wait for any commit if all buffers in * the folio remain valid. This is most beneficial for the common case of * blocksize == PAGESIZE. */ if (!offset || offset > (PAGE_SIZE - i_blocksize(inode))) return; while (1) { struct folio *folio = filemap_lock_folio(inode->i_mapping, inode->i_size >> PAGE_SHIFT); if (IS_ERR(folio)) return; ret = __ext4_journalled_invalidate_folio(folio, offset, folio_size(folio) - offset); folio_unlock(folio); folio_put(folio); if (ret != -EBUSY) return; has_transaction = false; read_lock(&journal->j_state_lock); if (journal->j_committing_transaction) { commit_tid = journal->j_committing_transaction->t_tid; has_transaction = true; } read_unlock(&journal->j_state_lock); if (has_transaction) jbd2_log_wait_commit(journal, commit_tid); } } /* * ext4_setattr() * * Called from notify_change. * * We want to trap VFS attempts to truncate the file as soon as * possible. In particular, we want to make sure that when the VFS * shrinks i_size, we put the inode on the orphan list and modify * i_disksize immediately, so that during the subsequent flushing of * dirty pages and freeing of disk blocks, we can guarantee that any * commit will leave the blocks being flushed in an unused state on * disk. (On recovery, the inode will get truncated and the blocks will * be freed, so we have a strong guarantee that no future commit will * leave these blocks visible to the user.) * * Another thing we have to assure is that if we are in ordered mode * and inode is still attached to the committing transaction, we must * we start writeout of all the dirty pages which are being truncated. * This way we are sure that all the data written in the previous * transaction are already on disk (truncate waits for pages under * writeback). * * Called with inode->i_rwsem down. */ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error, rc = 0; int orphan = 0; const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; if (unlikely(IS_APPEND(inode) && (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)))) return -EPERM; error = setattr_prepare(idmap, dentry, attr); if (error) return error; error = fscrypt_prepare_setattr(dentry, attr); if (error) return error; error = fsverity_prepare_setattr(dentry, attr); if (error) return error; if (is_quota_modification(idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; } /* dquot_transfer() calls back ext4_get_inode_usage() which * counts xattr inode references. */ down_read(&EXT4_I(inode)->xattr_sem); error = dquot_transfer(idmap, inode, attr); up_read(&EXT4_I(inode)->xattr_sem); if (error) { ext4_journal_stop(handle); return error; } /* Update corresponding info in inode so that everything is in * one transaction */ i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); if (unlikely(error)) { return error; } } if (attr->ia_valid & ATTR_SIZE) { handle_t *handle; loff_t oldsize = inode->i_size; loff_t old_disksize; int shrink = (attr->ia_size < inode->i_size); if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (attr->ia_size > sbi->s_bitmap_maxbytes) { return -EFBIG; } } if (!S_ISREG(inode->i_mode)) { return -EINVAL; } if (attr->ia_size == inode->i_size) inc_ivers = false; if (shrink) { if (ext4_should_order_data(inode)) { error = ext4_begin_ordered_truncate(inode, attr->ia_size); if (error) goto err_out; } /* * Blocks are going to be removed from the inode. Wait * for dio in flight. */ inode_dio_wait(inode); } filemap_invalidate_lock(inode->i_mapping); rc = ext4_break_layouts(inode); if (rc) { filemap_invalidate_unlock(inode->i_mapping); goto err_out; } if (attr->ia_size != inode->i_size) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto out_mmap_sem; } if (ext4_handle_valid(handle) && shrink) { error = ext4_orphan_add(handle, inode); orphan = 1; } /* * Update c/mtime on truncate up, ext4_truncate() will * update c/mtime in shrink case below */ if (!shrink) inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (shrink) ext4_fc_track_range(handle, inode, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits, EXT_MAX_BLOCKS - 1); else ext4_fc_track_range( handle, inode, (oldsize > 0 ? oldsize - 1 : oldsize) >> inode->i_sb->s_blocksize_bits, (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >> inode->i_sb->s_blocksize_bits); down_write(&EXT4_I(inode)->i_data_sem); old_disksize = EXT4_I(inode)->i_disksize; EXT4_I(inode)->i_disksize = attr->ia_size; rc = ext4_mark_inode_dirty(handle, inode); if (!error) error = rc; /* * We have to update i_size under i_data_sem together * with i_disksize to avoid races with writeback code * running ext4_wb_update_i_disksize(). */ if (!error) i_size_write(inode, attr->ia_size); else EXT4_I(inode)->i_disksize = old_disksize; up_write(&EXT4_I(inode)->i_data_sem); ext4_journal_stop(handle); if (error) goto out_mmap_sem; if (!shrink) { pagecache_isize_extended(inode, oldsize, inode->i_size); } else if (ext4_should_journal_data(inode)) { ext4_wait_for_tail_page_commit(inode); } } /* * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. */ truncate_pagecache(inode, inode->i_size); /* * Call ext4_truncate() even if i_size didn't change to * truncate possible preallocated blocks. */ if (attr->ia_size <= oldsize) { rc = ext4_truncate(inode); if (rc) error = rc; } out_mmap_sem: filemap_invalidate_unlock(inode->i_mapping); } if (!error) { if (inc_ivers) inode_inc_iversion(inode); setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); } /* * If the call to ext4_truncate failed to get a transaction handle at * all, we need to clean up the in-core orphan list manually. */ if (orphan && inode->i_nlink) ext4_orphan_del(NULL, inode); if (!error && (ia_valid & ATTR_MODE)) rc = posix_acl_chmod(idmap, dentry, inode->i_mode); err_out: if (error) ext4_std_error(inode->i_sb, error); if (!error) error = rc; return error; } u32 ext4_dio_alignment(struct inode *inode) { if (fsverity_active(inode)) return 0; if (ext4_should_journal_data(inode)) return 0; if (ext4_has_inline_data(inode)) return 0; if (IS_ENCRYPTED(inode)) { if (!fscrypt_dio_supported(inode)) return 0; return i_blocksize(inode); } return 1; /* use the iomap defaults */ } int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct ext4_inode *raw_inode; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; if ((request_mask & STATX_BTIME) && EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) { stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; stat->btime.tv_nsec = ei->i_crtime.tv_nsec; } /* * Return the DIO alignment restrictions if requested. We only return * this information when requested, since on encrypted files it might * take a fair bit of work to get if the file wasn't opened recently. */ if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) { u32 dio_align = ext4_dio_alignment(inode); stat->result_mask |= STATX_DIOALIGN; if (dio_align == 1) { struct block_device *bdev = inode->i_sb->s_bdev; /* iomap defaults */ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; stat->dio_offset_align = bdev_logical_block_size(bdev); } else { stat->dio_mem_align = dio_align; stat->dio_offset_align = dio_align; } } flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (flags & EXT4_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (flags & EXT4_COMPR_FL) stat->attributes |= STATX_ATTR_COMPRESSED; if (flags & EXT4_ENCRYPT_FL) stat->attributes |= STATX_ATTR_ENCRYPTED; if (flags & EXT4_IMMUTABLE_FL) stat->attributes |= STATX_ATTR_IMMUTABLE; if (flags & EXT4_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; if (flags & EXT4_VERITY_FL) stat->attributes |= STATX_ATTR_VERITY; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP | STATX_ATTR_VERITY); generic_fillattr(idmap, request_mask, inode, stat); return 0; } int ext4_file_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); u64 delalloc_blocks; ext4_getattr(idmap, path, stat, request_mask, query_flags); /* * If there is inline data in the inode, the inode will normally not * have data blocks allocated (it may have an external xattr block). * Report at least one sector for such files, so tools like tar, rsync, * others don't incorrectly think the file is completely sparse. */ if (unlikely(ext4_has_inline_data(inode))) stat->blocks += (stat->size + 511) >> 9; /* * We can't update i_blocks if the block allocation is delayed * otherwise in the case of system crash before the real block * allocation is done, we will have i_blocks inconsistent with * on-disk file blocks. * We always keep i_blocks updated together with real * allocation. But to not confuse with user, stat * will return the blocks that include the delayed allocation * blocks for this file. */ delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), EXT4_I(inode)->i_reserved_data_blocks); stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9); return 0; } static int ext4_index_trans_blocks(struct inode *inode, int lblocks, int pextents) { if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return ext4_ind_trans_blocks(inode, lblocks); return ext4_ext_index_trans_blocks(inode, pextents); } /* * Account for index blocks, block groups bitmaps and block group * descriptor blocks if modify datablocks and index blocks * worse case, the indexs blocks spread over different block groups * * If datablocks are discontiguous, they are possible to spread over * different block groups too. If they are contiguous, with flexbg, * they could still across block group boundary. * * Also account for superblock, inode, quota and xattr blocks */ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; int idxblocks; int ret; /* * How many index blocks need to touch to map @lblocks logical blocks * to @pextents physical extents? */ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); ret = idxblocks; /* * Now let's see how many group bitmaps and group descriptors need * to account */ groups = idxblocks + pextents; gdpblocks = groups; if (groups > ngroups) groups = ngroups; if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; /* bitmaps and block group descriptor blocks */ ret += groups + gdpblocks; /* Blocks for super block, inode, quota and xattr blocks */ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); return ret; } /* * Calculate the total number of credits to reserve to fit * the modification of a single pages into a single transaction, * which may include multiple chunks of block allocations. * * This could be called via ext4_write_begin() * * We need to consider the worse case, when * one new block per extent. */ int ext4_writepage_trans_blocks(struct inode *inode) { int bpp = ext4_journal_blocks_per_page(inode); int ret; ret = ext4_meta_trans_blocks(inode, bpp, bpp); /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) ret += bpp; return ret; } /* * Calculate the journal credits for a chunk of data modification. * * This is called from DIO, fallocate or whoever calling * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. * * journal buffers for data blocks are not included here, as DIO * and fallocate do no need to journal data buffers. */ int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) { return ext4_meta_trans_blocks(inode, nrblocks, 1); } /* * The caller must have previously called ext4_reserve_inode_write(). * Give this, we know that the caller already has write access to iloc->bh. */ int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { int err = 0; if (unlikely(ext4_forced_shutdown(inode->i_sb))) { put_bh(iloc->bh); return -EIO; } ext4_fc_track_inode(handle, inode); /* the do_update_inode consumes one bh->b_count */ get_bh(iloc->bh); /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ err = ext4_do_update_inode(handle, inode, iloc); put_bh(iloc->bh); return err; } /* * On success, We end up with an outstanding reference count against * iloc->bh. This _must_ be cleaned up later. */ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { int err; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; err = ext4_get_inode_loc(inode, iloc); if (!err) { BUFFER_TRACE(iloc->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, EXT4_JTR_NONE); if (err) { brelse(iloc->bh); iloc->bh = NULL; } } ext4_std_error(inode->i_sb, err); return err; } static int __ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc *iloc, handle_t *handle, int *no_expand) { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); int error; /* this was checked at iget time, but double check for good measure */ if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) || (ei->i_extra_isize & 3)) { EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); return -EFSCORRUPTED; } if ((new_extra_isize < ei->i_extra_isize) || (new_extra_isize < 4) || (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE)) return -EINVAL; /* Should never happen */ raw_inode = ext4_raw_inode(iloc); header = IHDR(inode, raw_inode); /* No extended attributes present */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize, 0, new_extra_isize - EXT4_I(inode)->i_extra_isize); EXT4_I(inode)->i_extra_isize = new_extra_isize; return 0; } /* * We may need to allocate external xattr block so we need quotas * initialized. Here we can be called with various locks held so we * cannot affort to initialize quotas ourselves. So just bail. */ if (dquot_initialize_needed(inode)) return -EAGAIN; /* try to expand with EAs present */ error = ext4_expand_extra_isize_ea(inode, new_extra_isize, raw_inode, handle); if (error) { /* * Inode size expansion failed; don't try again */ *no_expand = 1; } return error; } /* * Expand an inode by new_extra_isize bytes. * Returns 0 on success or negative error number on failure. */ static int ext4_try_to_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc iloc, handle_t *handle) { int no_expand; int error; if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) return -EOVERFLOW; /* * In nojournal mode, we can immediately attempt to expand * the inode. When journaled, we first need to obtain extra * buffer credits since we may write into the EA block * with this same handle. If journal_extend fails, then it will * only result in a minor loss of functionality for that inode. * If this is felt to be critical, then e2fsck should be run to * force a large enough s_min_extra_isize. */ if (ext4_journal_extend(handle, EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0) return -ENOSPC; if (ext4_write_trylock_xattr(inode, &no_expand) == 0) return -EBUSY; error = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc, handle, &no_expand); ext4_write_unlock_xattr(inode, &no_expand); return error; } int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, struct ext4_iloc *iloc) { handle_t *handle; int no_expand; int error, rc; if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { brelse(iloc->bh); return -EOVERFLOW; } handle = ext4_journal_start(inode, EXT4_HT_INODE, EXT4_DATA_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { error = PTR_ERR(handle); brelse(iloc->bh); return error; } ext4_write_lock_xattr(inode, &no_expand); BUFFER_TRACE(iloc->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, EXT4_JTR_NONE); if (error) { brelse(iloc->bh); goto out_unlock; } error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, handle, &no_expand); rc = ext4_mark_iloc_dirty(handle, inode, iloc); if (!error) error = rc; out_unlock: ext4_write_unlock_xattr(inode, &no_expand); ext4_journal_stop(handle); return error; } /* * What we do here is to mark the in-core inode as clean with respect to inode * dirtiness (it may still be data-dirty). * This means that the in-core inode may be reaped by prune_icache * without having to perform any I/O. This is a very good thing, * because *any* task may call prune_icache - even ones which * have a transaction open against a different journal. * * Is this cheating? Not really. Sure, we haven't written the * inode out, but prune_icache isn't a user-visible syncing function. * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) * we start and wait on commits. */ int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, const char *func, unsigned int line) { struct ext4_iloc iloc; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int err; might_sleep(); trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out; if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize) ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize, iloc, handle); err = ext4_mark_iloc_dirty(handle, inode, &iloc); out: if (unlikely(err)) ext4_error_inode_err(inode, func, line, 0, err, "mark_inode_dirty error"); return err; } /* * ext4_dirty_inode() is called from __mark_inode_dirty() * * We're really interested in the case where a file is being extended. * i_size has been changed by generic_commit_write() and we thus need * to include the updated inode in the current transaction. * * Also, dquot_alloc_block() will always dirty the inode when blocks * are allocated to the file. * * If the inode is marked synchronous, we don't honour that here - doing * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. */ void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return; ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); } int ext4_change_inode_journal_flag(struct inode *inode, int val) { journal_t *journal; handle_t *handle; int err; int alloc_ctx; /* * We have to be very careful here: changing a data block's * journaling status dynamically is dangerous. If we write a * data block to the journal, change the status and then delete * that block, we risk forgetting to revoke the old log record * from the journal and so a subsequent replay can corrupt data. * So, first we make sure that the journal is empty and that * nobody is changing anything. */ journal = EXT4_JOURNAL(inode); if (!journal) return 0; if (is_journal_aborted(journal)) return -EROFS; /* Wait for all existing dio workers */ inode_dio_wait(inode); /* * Before flushing the journal and switching inode's aops, we have * to flush all dirty data the inode has. There can be outstanding * delayed allocations, there can be unwritten extents created by * fallocate or buffered writes in dioread_nolock mode covered by * dirty data which can be converted only after flushing the dirty * data (and journalled aops don't know how to handle these cases). */ if (val) { filemap_invalidate_lock(inode->i_mapping); err = filemap_write_and_wait(inode->i_mapping); if (err < 0) { filemap_invalidate_unlock(inode->i_mapping); return err; } } alloc_ctx = ext4_writepages_down_write(inode->i_sb); jbd2_journal_lock_updates(journal); /* * OK, there are no updates running now, and all cached data is * synced to disk. We are now in a completely consistent state * which doesn't have anything in the journal, and we know that * no filesystem updates are running, so it is safe to modify * the inode's in-core data-journaling state flag now. */ if (val) ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); else { err = jbd2_journal_flush(journal, 0); if (err < 0) { jbd2_journal_unlock_updates(journal); ext4_writepages_up_write(inode->i_sb, alloc_ctx); return err; } ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); } ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); ext4_writepages_up_write(inode->i_sb, alloc_ctx); if (val) filemap_invalidate_unlock(inode->i_mapping); /* Finally we can mark the inode as dirty. */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle); err = ext4_mark_inode_dirty(handle, inode); ext4_handle_sync(handle); ext4_journal_stop(handle); ext4_std_error(inode->i_sb, err); return err; } static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, struct buffer_head *bh) { return !buffer_mapped(bh); } vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *folio = page_folio(vmf->page); loff_t size; unsigned long len; int err; vm_fault_t ret; struct file *file = vma->vm_file; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; handle_t *handle; get_block_t *get_block; int retries = 0; if (unlikely(IS_IMMUTABLE(inode))) return VM_FAULT_SIGBUS; sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); filemap_invalidate_lock_shared(mapping); err = ext4_convert_inline_data(inode); if (err) goto out_ret; /* * On data journalling we skip straight to the transaction handle: * there's no delalloc; page truncated will be checked later; the * early return w/ all buffers mapped (calculates size/len) can't * be used; and there's no dioread_nolock, so only ext4_get_block. */ if (ext4_should_journal_data(inode)) goto retry_alloc; /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb)) { do { err = block_page_mkwrite(vma, vmf, ext4_da_get_block_prep); } while (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)); goto out_ret; } folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? */ if (folio->mapping != mapping || folio_pos(folio) > size) { folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } len = folio_size(folio); if (folio_pos(folio) + len > size) len = size - folio_pos(folio); /* * Return if we have all the buffers mapped. This avoids the need to do * journal_start/journal_stop which can block and take a long time * * This cannot be done for data journalling, as we have to add the * inode to the transaction's list to writeprotect pages on commit. */ if (folio_buffers(folio)) { if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio), 0, len, NULL, ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ folio_wait_stable(folio); ret = VM_FAULT_LOCKED; goto out; } } folio_unlock(folio); /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) get_block = ext4_get_block_unwritten; else get_block = ext4_get_block; retry_alloc: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = VM_FAULT_SIGBUS; goto out; } /* * Data journalling can't use block_page_mkwrite() because it * will set_buffer_dirty() before do_journal_get_write_access() * thus might hit warning messages for dirty metadata buffers. */ if (!ext4_should_journal_data(inode)) { err = block_page_mkwrite(vma, vmf, get_block); } else { folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? */ if (folio->mapping != mapping || folio_pos(folio) > size) { ret = VM_FAULT_NOPAGE; goto out_error; } len = folio_size(folio); if (folio_pos(folio) + len > size) len = size - folio_pos(folio); err = ext4_block_write_begin(handle, folio, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; if (ext4_journal_folio_buffers(handle, folio, len)) goto out_error; } else { folio_unlock(folio); } } ext4_journal_stop(handle); if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_alloc; out_ret: ret = vmf_fs_error(err); out: filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); return ret; out_error: folio_unlock(folio); ext4_journal_stop(handle); goto out; } |
3 3 5 1 1 4 4 4 4 1 1 2 2 1 1 2 2 6 1 2 6 6 4 2 4 1 6 2 2 1 3 4 9 8 5 8 3 6 1 7 1 1 7 9 4 2 10 10 6 6 2 4 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */ #include <linux/skmsg.h> #include <linux/filter.h> #include <linux/bpf.h> #include <linux/init.h> #include <linux/wait.h> #include <linux/util_macros.h> #include <net/inet_common.h> #include <net/tls.h> void tcp_eat_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tcp; int copied; if (!skb || !skb->len || !sk_is_tcp(sk)) return; if (skb_bpf_strparser(skb)) return; tcp = tcp_sk(sk); copied = tcp->copied_seq + skb->len; WRITE_ONCE(tcp->copied_seq, copied); tcp_rcv_space_adjust(sk); __tcp_cleanup_rbuf(sk, skb->len); } static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, u32 apply_bytes) { bool apply = apply_bytes; struct scatterlist *sge; u32 size, copied = 0; struct sk_msg *tmp; int i, ret = 0; tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL); if (unlikely(!tmp)) return -ENOMEM; lock_sock(sk); tmp->sg.start = msg->sg.start; i = msg->sg.start; do { sge = sk_msg_elem(msg, i); size = (apply && apply_bytes < sge->length) ? apply_bytes : sge->length; if (!sk_wmem_schedule(sk, size)) { if (!copied) ret = -ENOMEM; break; } sk_mem_charge(sk, size); sk_msg_xfer(tmp, msg, i, size); copied += size; if (sge->length) get_page(sk_msg_page(tmp, i)); sk_msg_iter_var_next(i); tmp->sg.end = i; if (apply) { apply_bytes -= size; if (!apply_bytes) { if (sge->length) sk_msg_iter_var_prev(i); break; } } } while (i != msg->sg.end); if (!ret) { msg->sg.start = i; sk_psock_queue_msg(psock, tmp); sk_psock_data_ready(sk, psock); } else { sk_msg_free(sk, tmp); kfree(tmp); } release_sock(sk); return ret; } static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, int flags, bool uncharge) { struct msghdr msghdr = {}; bool apply = apply_bytes; struct scatterlist *sge; struct page *page; int size, ret = 0; u32 off; while (1) { struct bio_vec bvec; bool has_tx_ulp; sge = sk_msg_elem(msg, msg->sg.start); size = (apply && apply_bytes < sge->length) ? apply_bytes : sge->length; off = sge->offset; page = sg_page(sge); tcp_rate_check_app_limited(sk); retry: msghdr.msg_flags = flags | MSG_SPLICE_PAGES; has_tx_ulp = tls_sw_has_ctx_tx(sk); if (has_tx_ulp) msghdr.msg_flags |= MSG_SENDPAGE_NOPOLICY; if (size < sge->length && msg->sg.start != msg->sg.end) msghdr.msg_flags |= MSG_MORE; bvec_set_page(&bvec, page, size, off); iov_iter_bvec(&msghdr.msg_iter, ITER_SOURCE, &bvec, 1, size); ret = tcp_sendmsg_locked(sk, &msghdr, size); if (ret <= 0) return ret; if (apply) apply_bytes -= ret; msg->sg.size -= ret; sge->offset += ret; sge->length -= ret; if (uncharge) sk_mem_uncharge(sk, ret); if (ret != size) { size -= ret; off += ret; goto retry; } if (!sge->length) { put_page(page); sk_msg_iter_next(msg, start); sg_init_table(sge, 1); if (msg->sg.start == msg->sg.end) break; } if (apply && !apply_bytes) break; } return 0; } static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg, u32 apply_bytes, int flags, bool uncharge) { int ret; lock_sock(sk); ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge); release_sock(sk); return ret; } int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, struct sk_msg *msg, u32 bytes, int flags) { struct sk_psock *psock = sk_psock_get(sk); int ret; if (unlikely(!psock)) return -EPIPE; ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes) : tcp_bpf_push_locked(sk, msg, bytes, flags, false); sk_psock_put(sk, psock); return ret; } EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); #ifdef CONFIG_BPF_SYSCALL static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int ret = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) return 1; if (!timeo) return ret; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); ret = sk_wait_event(sk, &timeo, !list_empty(&psock->ingress_msg) || !skb_queue_empty_lockless(&sk->sk_receive_queue), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return ret; } static bool is_next_msg_fin(struct sk_psock *psock) { struct scatterlist *sge; struct sk_msg *msg_rx; int i; msg_rx = sk_psock_peek_msg(psock); i = msg_rx->sg.start; sge = sk_msg_elem(msg_rx, i); if (!sge->length) { struct sk_buff *skb = msg_rx->skb; if (skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) return true; } return false; } static int tcp_bpf_recvmsg_parser(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct tcp_sock *tcp = tcp_sk(sk); int peek = flags & MSG_PEEK; u32 seq = tcp->copied_seq; struct sk_psock *psock; int copied = 0; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); if (!len) return 0; psock = sk_psock_get(sk); if (unlikely(!psock)) return tcp_recvmsg(sk, msg, len, flags, addr_len); lock_sock(sk); /* We may have received data on the sk_receive_queue pre-accept and * then we can not use read_skb in this context because we haven't * assigned a sk_socket yet so have no link to the ops. The work-around * is to check the sk_receive_queue and in these cases read skbs off * queue again. The read_skb hook is not running at this point because * of lock_sock so we avoid having multiple runners in read_skb. */ if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) { tcp_data_ready(sk); /* This handles the ENOMEM errors if we both receive data * pre accept and are already under memory pressure. At least * let user know to retry. */ if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) { copied = -EAGAIN; goto out; } } msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); /* The typical case for EFAULT is the socket was gracefully * shutdown with a FIN pkt. So check here the other case is * some error on copy_page_to_iter which would be unexpected. * On fin return correct return code to zero. */ if (copied == -EFAULT) { bool is_fin = is_next_msg_fin(psock); if (is_fin) { copied = 0; seq++; goto out; } } seq += copied; if (!copied) { long timeo; int data; if (sock_flag(sk, SOCK_DONE)) goto out; if (sk->sk_err) { copied = sock_error(sk); goto out; } if (sk->sk_shutdown & RCV_SHUTDOWN) goto out; if (sk->sk_state == TCP_CLOSE) { copied = -ENOTCONN; goto out; } timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); if (!timeo) { copied = -EAGAIN; goto out; } if (signal_pending(current)) { copied = sock_intr_errno(timeo); goto out; } data = tcp_msg_wait_data(sk, psock, timeo); if (data < 0) { copied = data; goto unlock; } if (data && !sk_psock_queue_empty(psock)) goto msg_bytes_ready; copied = -EAGAIN; } out: if (!peek) WRITE_ONCE(tcp->copied_seq, seq); tcp_rcv_space_adjust(sk); if (copied > 0) __tcp_cleanup_rbuf(sk, copied); unlock: release_sock(sk); sk_psock_put(sk, psock); return copied; } static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct sk_psock *psock; int copied, ret; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); if (!len) return 0; psock = sk_psock_get(sk); if (unlikely(!psock)) return tcp_recvmsg(sk, msg, len, flags, addr_len); if (!skb_queue_empty(&sk->sk_receive_queue) && sk_psock_queue_empty(psock)) { sk_psock_put(sk, psock); return tcp_recvmsg(sk, msg, len, flags, addr_len); } lock_sock(sk); msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { long timeo; int data; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); data = tcp_msg_wait_data(sk, psock, timeo); if (data < 0) { ret = data; goto unlock; } if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; release_sock(sk); sk_psock_put(sk, psock); return tcp_recvmsg(sk, msg, len, flags, addr_len); } copied = -EAGAIN; } ret = copied; unlock: release_sock(sk); sk_psock_put(sk, psock); return ret; } static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { bool cork = false, enospc = sk_msg_full(msg), redir_ingress; struct sock *sk_redir; u32 tosend, origsize, sent, delta = 0; u32 eval; int ret; more_data: if (psock->eval == __SK_NONE) { /* Track delta in msg size to add/subtract it on SK_DROP from * returned to user copied size. This ensures user doesn't * get a positive return code with msg_cut_data and SK_DROP * verdict. */ delta = msg->sg.size; psock->eval = sk_psock_msg_verdict(sk, psock, msg); delta -= msg->sg.size; } if (msg->cork_bytes && msg->cork_bytes > msg->sg.size && !enospc) { psock->cork_bytes = msg->cork_bytes - msg->sg.size; if (!psock->cork) { psock->cork = kzalloc(sizeof(*psock->cork), GFP_ATOMIC | __GFP_NOWARN); if (!psock->cork) return -ENOMEM; } memcpy(psock->cork, msg, sizeof(*msg)); return 0; } tosend = msg->sg.size; if (psock->apply_bytes && psock->apply_bytes < tosend) tosend = psock->apply_bytes; eval = __SK_NONE; switch (psock->eval) { case __SK_PASS: ret = tcp_bpf_push(sk, msg, tosend, flags, true); if (unlikely(ret)) { *copied -= sk_msg_free(sk, msg); break; } sk_msg_apply_bytes(psock, tosend); break; case __SK_REDIRECT: redir_ingress = psock->redir_ingress; sk_redir = psock->sk_redir; sk_msg_apply_bytes(psock, tosend); if (!psock->apply_bytes) { /* Clean up before releasing the sock lock. */ eval = psock->eval; psock->eval = __SK_NONE; psock->sk_redir = NULL; } if (psock->cork) { cork = true; psock->cork = NULL; } sk_msg_return(sk, msg, tosend); release_sock(sk); origsize = msg->sg.size; ret = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress, msg, tosend, flags); sent = origsize - msg->sg.size; if (eval == __SK_REDIRECT) sock_put(sk_redir); lock_sock(sk); if (unlikely(ret < 0)) { int free = sk_msg_free_nocharge(sk, msg); if (!cork) *copied -= free; } if (cork) { sk_msg_free(sk, msg); kfree(msg); msg = NULL; ret = 0; } break; case __SK_DROP: default: sk_msg_free_partial(sk, msg, tosend); sk_msg_apply_bytes(psock, tosend); *copied -= (tosend + delta); return -EACCES; } if (likely(!ret)) { if (!psock->apply_bytes) { psock->eval = __SK_NONE; if (psock->sk_redir) { sock_put(psock->sk_redir); psock->sk_redir = NULL; } } if (msg && msg->sg.data[msg->sg.start].page_link && msg->sg.data[msg->sg.start].length) { if (eval == __SK_REDIRECT) sk_mem_charge(sk, tosend - sent); goto more_data; } } return ret; } static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_msg tmp, *msg_tx = NULL; int copied = 0, err = 0; struct sk_psock *psock; long timeo; int flags; /* Don't let internal flags through */ flags = (msg->msg_flags & ~MSG_SENDPAGE_DECRYPTED); flags |= MSG_NO_SHARED_FRAGS; psock = sk_psock_get(sk); if (unlikely(!psock)) return tcp_sendmsg(sk, msg, size); lock_sock(sk); timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); while (msg_data_left(msg)) { bool enospc = false; u32 copy, osize; if (sk->sk_err) { err = -sk->sk_err; goto out_err; } copy = msg_data_left(msg); if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; if (psock->cork) { msg_tx = psock->cork; } else { msg_tx = &tmp; sk_msg_init(msg_tx); } osize = msg_tx->sg.size; err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1); if (err) { if (err != -ENOSPC) goto wait_for_memory; enospc = true; copy = msg_tx->sg.size - osize; } err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, copy); if (err < 0) { sk_msg_trim(sk, msg_tx, osize); goto out_err; } copied += copy; if (psock->cork_bytes) { if (size > psock->cork_bytes) psock->cork_bytes = 0; else psock->cork_bytes -= size; if (psock->cork_bytes && !enospc) goto out_err; /* All cork bytes are accounted, rerun the prog. */ psock->eval = __SK_NONE; psock->cork_bytes = 0; } err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags); if (unlikely(err < 0)) goto out_err; continue; wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: err = sk_stream_wait_memory(sk, &timeo); if (err) { if (msg_tx && msg_tx != psock->cork) sk_msg_free(sk, msg_tx); goto out_err; } } out_err: if (err < 0) err = sk_stream_error(sk, msg->msg_flags, err); release_sock(sk); sk_psock_put(sk, psock); return copied > 0 ? copied : err; } enum { TCP_BPF_IPV4, TCP_BPF_IPV6, TCP_BPF_NUM_PROTS, }; enum { TCP_BPF_BASE, TCP_BPF_TX, TCP_BPF_RX, TCP_BPF_TXRX, TCP_BPF_NUM_CFGS, }; static struct proto *tcpv6_prot_saved __read_mostly; static DEFINE_SPINLOCK(tcpv6_prot_lock); static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS]; static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], struct proto *base) { prot[TCP_BPF_BASE] = *base; prot[TCP_BPF_BASE].destroy = sock_map_destroy; prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; prot[TCP_BPF_RX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_RX].recvmsg = tcp_bpf_recvmsg_parser; prot[TCP_BPF_TXRX] = prot[TCP_BPF_TX]; prot[TCP_BPF_TXRX].recvmsg = tcp_bpf_recvmsg_parser; } static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops) { if (unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) { spin_lock_bh(&tcpv6_prot_lock); if (likely(ops != tcpv6_prot_saved)) { tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops); smp_store_release(&tcpv6_prot_saved, ops); } spin_unlock_bh(&tcpv6_prot_lock); } } static int __init tcp_bpf_v4_build_proto(void) { tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot); return 0; } late_initcall(tcp_bpf_v4_build_proto); static int tcp_bpf_assert_proto_ops(struct proto *ops) { /* In order to avoid retpoline, we make assumptions when we call * into ops if e.g. a psock is not present. Make sure they are * indeed valid assumptions. */ return ops->recvmsg == tcp_recvmsg && ops->sendmsg == tcp_sendmsg ? 0 : -ENOTSUPP; } int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE; if (psock->progs.stream_verdict || psock->progs.skb_verdict) { config = (config == TCP_BPF_TX) ? TCP_BPF_TXRX : TCP_BPF_RX; } if (restore) { if (inet_csk_has_ulp(sk)) { /* TLS does not have an unhash proto in SW cases, * but we need to ensure we stop using the sock_map * unhash routine because the associated psock is being * removed. So use the original unhash handler. */ WRITE_ONCE(sk->sk_prot->unhash, psock->saved_unhash); tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space); } else { sk->sk_write_space = psock->saved_write_space; /* Pairs with lockless read in sk_clone_lock() */ sock_replace_proto(sk, psock->sk_proto); } return 0; } if (sk->sk_family == AF_INET6) { if (tcp_bpf_assert_proto_ops(psock->sk_proto)) return -EINVAL; tcp_bpf_check_v6_needs_rebuild(psock->sk_proto); } /* Pairs with lockless read in sk_clone_lock() */ sock_replace_proto(sk, &tcp_bpf_prots[family][config]); return 0; } EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); /* If a child got cloned from a listening socket that had tcp_bpf * protocol callbacks installed, we need to restore the callbacks to * the default ones because the child does not inherit the psock state * that tcp_bpf callbacks expect. */ void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) { struct proto *prot = newsk->sk_prot; if (is_insidevar(prot, tcp_bpf_prots)) newsk->sk_prot = sk->sk_prot_creator; } #endif /* CONFIG_BPF_SYSCALL */ |
3 3 3 4 1 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 | /* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/percpu.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/export.h> #include "rds.h" DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); EXPORT_PER_CPU_SYMBOL_GPL(rds_stats); /* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */ static const char *const rds_stat_names[] = { "conn_reset", "recv_drop_bad_checksum", "recv_drop_old_seq", "recv_drop_no_sock", "recv_drop_dead_sock", "recv_deliver_raced", "recv_delivered", "recv_queued", "recv_immediate_retry", "recv_delayed_retry", "recv_ack_required", "recv_rdma_bytes", "recv_ping", "send_queue_empty", "send_queue_full", "send_lock_contention", "send_lock_queue_raced", "send_immediate_retry", "send_delayed_retry", "send_drop_acked", "send_ack_required", "send_queued", "send_rdma", "send_rdma_bytes", "send_pong", "page_remainder_hit", "page_remainder_miss", "copy_to_user", "copy_from_user", "cong_update_queued", "cong_update_received", "cong_send_error", "cong_send_blocked", "recv_bytes_added_to_sock", "recv_bytes_freed_fromsock", "send_stuck_rm", }; void rds_stats_info_copy(struct rds_info_iterator *iter, uint64_t *values, const char *const *names, size_t nr) { struct rds_info_counter ctr; size_t i; for (i = 0; i < nr; i++) { BUG_ON(strlen(names[i]) >= sizeof(ctr.name)); strncpy(ctr.name, names[i], sizeof(ctr.name) - 1); ctr.name[sizeof(ctr.name) - 1] = '\0'; ctr.value = values[i]; rds_info_copy(iter, &ctr, sizeof(ctr)); } } EXPORT_SYMBOL_GPL(rds_stats_info_copy); /* * This gives global counters across all the transports. The strings * are copied in so that the tool doesn't need knowledge of the specific * stats that we're exporting. Some are pretty implementation dependent * and may change over time. That doesn't stop them from being useful. * * This is the only function in the chain that knows about the byte granular * length in userspace. It converts it to number of stat entries that the * rest of the functions operate in. */ static void rds_stats_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { struct rds_statistics stats = {0, }; uint64_t *src; uint64_t *sum; size_t i; int cpu; unsigned int avail; avail = len / sizeof(struct rds_info_counter); if (avail < ARRAY_SIZE(rds_stat_names)) { avail = 0; goto trans; } for_each_online_cpu(cpu) { src = (uint64_t *)&(per_cpu(rds_stats, cpu)); sum = (uint64_t *)&stats; for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) *(sum++) += *(src++); } rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names, ARRAY_SIZE(rds_stat_names)); avail -= ARRAY_SIZE(rds_stat_names); trans: lens->each = sizeof(struct rds_info_counter); lens->nr = rds_trans_stats_info_copy(iter, avail) + ARRAY_SIZE(rds_stat_names); } void rds_stats_exit(void) { rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); } int rds_stats_init(void) { rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); return 0; } |
3 80 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMAN_H #define _LINUX_MMAN_H #include <linux/mm.h> #include <linux/percpu_counter.h> #include <linux/atomic.h> #include <uapi/linux/mman.h> /* * Arrange for legacy / undefined architecture specific flags to be * ignored by mmap handling code. */ #ifndef MAP_32BIT #define MAP_32BIT 0 #endif #ifndef MAP_ABOVE4G #define MAP_ABOVE4G 0 #endif #ifndef MAP_HUGE_2MB #define MAP_HUGE_2MB 0 #endif #ifndef MAP_HUGE_1GB #define MAP_HUGE_1GB 0 #endif #ifndef MAP_UNINITIALIZED #define MAP_UNINITIALIZED 0 #endif #ifndef MAP_SYNC #define MAP_SYNC 0 #endif /* * The historical set of flags that all mmap implementations implicitly * support when a ->mmap_validate() op is not provided in file_operations. * * MAP_EXECUTABLE and MAP_DENYWRITE are completely ignored throughout the * kernel. */ #define LEGACY_MAP_MASK (MAP_SHARED \ | MAP_PRIVATE \ | MAP_FIXED \ | MAP_ANONYMOUS \ | MAP_DENYWRITE \ | MAP_EXECUTABLE \ | MAP_UNINITIALIZED \ | MAP_GROWSDOWN \ | MAP_LOCKED \ | MAP_NORESERVE \ | MAP_POPULATE \ | MAP_NONBLOCK \ | MAP_STACK \ | MAP_HUGETLB \ | MAP_32BIT \ | MAP_ABOVE4G \ | MAP_HUGE_2MB \ | MAP_HUGE_1GB) extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; extern struct percpu_counter vm_committed_as; #ifdef CONFIG_SMP extern s32 vm_committed_as_batch; extern void mm_compute_batch(int overcommit_policy); #else #define vm_committed_as_batch 0 static inline void mm_compute_batch(int overcommit_policy) { } #endif unsigned long vm_memory_committed(void); static inline void vm_acct_memory(long pages) { percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch); } static inline void vm_unacct_memory(long pages) { vm_acct_memory(-pages); } /* * Allow architectures to handle additional protection and flag bits. The * overriding macros must be defined in the arch-specific asm/mman.h file. */ #ifndef arch_calc_vm_prot_bits #define arch_calc_vm_prot_bits(prot, pkey) 0 #endif #ifndef arch_calc_vm_flag_bits #define arch_calc_vm_flag_bits(flags) 0 #endif #ifndef arch_validate_prot /* * This is called from mprotect(). PROT_GROWSDOWN and PROT_GROWSUP have * already been masked out. * * Returns true if the prot flags are valid */ static inline bool arch_validate_prot(unsigned long prot, unsigned long addr) { return (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) == 0; } #define arch_validate_prot arch_validate_prot #endif #ifndef arch_validate_flags /* * This is called from mmap() and mprotect() with the updated vma->vm_flags. * * Returns true if the VM_* flags are valid. */ static inline bool arch_validate_flags(unsigned long flags) { return true; } #define arch_validate_flags arch_validate_flags #endif /* * Optimisation macro. It is equivalent to: * (x & bit1) ? bit2 : 0 * but this version is faster. * ("bit1" and "bit2" must be single bits) */ #define _calc_vm_trans(x, bit1, bit2) \ ((!(bit1) || !(bit2)) ? 0 : \ ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \ : ((x) & (bit1)) / ((bit1) / (bit2)))) /* * Combine the mmap "prot" argument into "vm_flags" used internally. */ static inline unsigned long calc_vm_prot_bits(unsigned long prot, unsigned long pkey) { return _calc_vm_trans(prot, PROT_READ, VM_READ ) | _calc_vm_trans(prot, PROT_WRITE, VM_WRITE) | _calc_vm_trans(prot, PROT_EXEC, VM_EXEC) | arch_calc_vm_prot_bits(prot, pkey); } /* * Combine the mmap "flags" argument into "vm_flags" used internally. */ static inline unsigned long calc_vm_flag_bits(unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) | _calc_vm_trans(flags, MAP_STACK, VM_NOHUGEPAGE) | arch_calc_vm_flag_bits(flags); } unsigned long vm_commit_limit(void); #ifndef arch_memory_deny_write_exec_supported static inline bool arch_memory_deny_write_exec_supported(void) { return true; } #define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported #endif /* * Denies creating a writable executable mapping or gaining executable permissions. * * This denies the following: * * a) mmap(PROT_WRITE | PROT_EXEC) * * b) mmap(PROT_WRITE) * mprotect(PROT_EXEC) * * c) mmap(PROT_WRITE) * mprotect(PROT_READ) * mprotect(PROT_EXEC) * * But allows the following: * * d) mmap(PROT_READ | PROT_EXEC) * mmap(PROT_READ | PROT_EXEC | PROT_BTI) */ static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags) { if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) return false; if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE)) return true; if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC)) return true; return false; } #endif /* _LINUX_MMAN_H */ |
2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 | // SPDX-License-Identifier: GPL-2.0-or-later /* LRW: as defined by Cyril Guyot in * http://grouper.ieee.org/groups/1619/email/pdf00017.pdf * * Copyright (c) 2006 Rik Snel <rsnel@cube.dyndns.org> * * Based on ecb.c * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ /* This implementation is checked against the test vectors in the above * document and by a test vector provided by Ken Buchanan at * https://www.mail-archive.com/stds-p1619@listserv.ieee.org/msg00173.html * * The test vectors are included in the testing module tcrypt.[ch] */ #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <crypto/b128ops.h> #include <crypto/gf128mul.h> #define LRW_BLOCK_SIZE 16 struct lrw_tfm_ctx { struct crypto_skcipher *child; /* * optimizes multiplying a random (non incrementing, as at the * start of a new sector) value with key2, we could also have * used 4k optimization tables or no optimization at all. In the * latter case we would have to store key2 here */ struct gf128mul_64k *table; /* * stores: * key2*{ 0,0,...0,0,0,0,1 }, key2*{ 0,0,...0,0,0,1,1 }, * key2*{ 0,0,...0,0,1,1,1 }, key2*{ 0,0,...0,1,1,1,1 } * key2*{ 0,0,...1,1,1,1,1 }, etc * needed for optimized multiplication of incrementing values * with key2 */ be128 mulinc[128]; }; struct lrw_request_ctx { be128 t; struct skcipher_request subreq; }; static inline void lrw_setbit128_bbe(void *b, int bit) { __set_bit(bit ^ (0x80 - #ifdef __BIG_ENDIAN BITS_PER_LONG #else BITS_PER_BYTE #endif ), b); } static int lrw_setkey(struct crypto_skcipher *parent, const u8 *key, unsigned int keylen) { struct lrw_tfm_ctx *ctx = crypto_skcipher_ctx(parent); struct crypto_skcipher *child = ctx->child; int err, bsize = LRW_BLOCK_SIZE; const u8 *tweak = key + keylen - bsize; be128 tmp = { 0 }; int i; crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(child, key, keylen - bsize); if (err) return err; if (ctx->table) gf128mul_free_64k(ctx->table); /* initialize multiplication table for Key2 */ ctx->table = gf128mul_init_64k_bbe((be128 *)tweak); if (!ctx->table) return -ENOMEM; /* initialize optimization table */ for (i = 0; i < 128; i++) { lrw_setbit128_bbe(&tmp, i); ctx->mulinc[i] = tmp; gf128mul_64k_bbe(&ctx->mulinc[i], ctx->table); } return 0; } /* * Returns the number of trailing '1' bits in the words of the counter, which is * represented by 4 32-bit words, arranged from least to most significant. * At the same time, increments the counter by one. * * For example: * * u32 counter[4] = { 0xFFFFFFFF, 0x1, 0x0, 0x0 }; * int i = lrw_next_index(&counter); * // i == 33, counter == { 0x0, 0x2, 0x0, 0x0 } */ static int lrw_next_index(u32 *counter) { int i, res = 0; for (i = 0; i < 4; i++) { if (counter[i] + 1 != 0) return res + ffz(counter[i]++); counter[i] = 0; res += 32; } /* * If we get here, then x == 128 and we are incrementing the counter * from all ones to all zeros. This means we must return index 127, i.e. * the one corresponding to key2*{ 1,...,1 }. */ return 127; } /* * We compute the tweak masks twice (both before and after the ECB encryption or * decryption) to avoid having to allocate a temporary buffer and/or make * mutliple calls to the 'ecb(..)' instance, which usually would be slower than * just doing the lrw_next_index() calls again. */ static int lrw_xor_tweak(struct skcipher_request *req, bool second_pass) { const int bs = LRW_BLOCK_SIZE; struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct lrw_tfm_ctx *ctx = crypto_skcipher_ctx(tfm); struct lrw_request_ctx *rctx = skcipher_request_ctx(req); be128 t = rctx->t; struct skcipher_walk w; __be32 *iv; u32 counter[4]; int err; if (second_pass) { req = &rctx->subreq; /* set to our TFM to enforce correct alignment: */ skcipher_request_set_tfm(req, tfm); } err = skcipher_walk_virt(&w, req, false); if (err) return err; iv = (__be32 *)w.iv; counter[0] = be32_to_cpu(iv[3]); counter[1] = be32_to_cpu(iv[2]); counter[2] = be32_to_cpu(iv[1]); counter[3] = be32_to_cpu(iv[0]); while (w.nbytes) { unsigned int avail = w.nbytes; be128 *wsrc; be128 *wdst; wsrc = w.src.virt.addr; wdst = w.dst.virt.addr; do { be128_xor(wdst++, &t, wsrc++); /* T <- I*Key2, using the optimization * discussed in the specification */ be128_xor(&t, &t, &ctx->mulinc[lrw_next_index(counter)]); } while ((avail -= bs) >= bs); if (second_pass && w.nbytes == w.total) { iv[0] = cpu_to_be32(counter[3]); iv[1] = cpu_to_be32(counter[2]); iv[2] = cpu_to_be32(counter[1]); iv[3] = cpu_to_be32(counter[0]); } err = skcipher_walk_done(&w, avail); } return err; } static int lrw_xor_tweak_pre(struct skcipher_request *req) { return lrw_xor_tweak(req, false); } static int lrw_xor_tweak_post(struct skcipher_request *req) { return lrw_xor_tweak(req, true); } static void lrw_crypt_done(void *data, int err) { struct skcipher_request *req = data; if (!err) { struct lrw_request_ctx *rctx = skcipher_request_ctx(req); rctx->subreq.base.flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; err = lrw_xor_tweak_post(req); } skcipher_request_complete(req, err); } static void lrw_init_crypt(struct skcipher_request *req) { const struct lrw_tfm_ctx *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); struct lrw_request_ctx *rctx = skcipher_request_ctx(req); struct skcipher_request *subreq = &rctx->subreq; skcipher_request_set_tfm(subreq, ctx->child); skcipher_request_set_callback(subreq, req->base.flags, lrw_crypt_done, req); /* pass req->iv as IV (will be used by xor_tweak, ECB will ignore it) */ skcipher_request_set_crypt(subreq, req->dst, req->dst, req->cryptlen, req->iv); /* calculate first value of T */ memcpy(&rctx->t, req->iv, sizeof(rctx->t)); /* T <- I*Key2 */ gf128mul_64k_bbe(&rctx->t, ctx->table); } static int lrw_encrypt(struct skcipher_request *req) { struct lrw_request_ctx *rctx = skcipher_request_ctx(req); struct skcipher_request *subreq = &rctx->subreq; lrw_init_crypt(req); return lrw_xor_tweak_pre(req) ?: crypto_skcipher_encrypt(subreq) ?: lrw_xor_tweak_post(req); } static int lrw_decrypt(struct skcipher_request *req) { struct lrw_request_ctx *rctx = skcipher_request_ctx(req); struct skcipher_request *subreq = &rctx->subreq; lrw_init_crypt(req); return lrw_xor_tweak_pre(req) ?: crypto_skcipher_decrypt(subreq) ?: lrw_xor_tweak_post(req); } static int lrw_init_tfm(struct crypto_skcipher *tfm) { struct skcipher_instance *inst = skcipher_alg_instance(tfm); struct crypto_skcipher_spawn *spawn = skcipher_instance_ctx(inst); struct lrw_tfm_ctx *ctx = crypto_skcipher_ctx(tfm); struct crypto_skcipher *cipher; cipher = crypto_spawn_skcipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; crypto_skcipher_set_reqsize(tfm, crypto_skcipher_reqsize(cipher) + sizeof(struct lrw_request_ctx)); return 0; } static void lrw_exit_tfm(struct crypto_skcipher *tfm) { struct lrw_tfm_ctx *ctx = crypto_skcipher_ctx(tfm); if (ctx->table) gf128mul_free_64k(ctx->table); crypto_free_skcipher(ctx->child); } static void lrw_free_instance(struct skcipher_instance *inst) { crypto_drop_skcipher(skcipher_instance_ctx(inst)); kfree(inst); } static int lrw_create(struct crypto_template *tmpl, struct rtattr **tb) { struct crypto_skcipher_spawn *spawn; struct skcipher_alg_common *alg; struct skcipher_instance *inst; const char *cipher_name; char ecb_name[CRYPTO_MAX_ALG_NAME]; u32 mask; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SKCIPHER, &mask); if (err) return err; cipher_name = crypto_attr_alg_name(tb[1]); if (IS_ERR(cipher_name)) return PTR_ERR(cipher_name); inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = skcipher_instance_ctx(inst); err = crypto_grab_skcipher(spawn, skcipher_crypto_instance(inst), cipher_name, 0, mask); if (err == -ENOENT) { err = -ENAMETOOLONG; if (snprintf(ecb_name, CRYPTO_MAX_ALG_NAME, "ecb(%s)", cipher_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; err = crypto_grab_skcipher(spawn, skcipher_crypto_instance(inst), ecb_name, 0, mask); } if (err) goto err_free_inst; alg = crypto_spawn_skcipher_alg_common(spawn); err = -EINVAL; if (alg->base.cra_blocksize != LRW_BLOCK_SIZE) goto err_free_inst; if (alg->ivsize) goto err_free_inst; err = crypto_inst_setname(skcipher_crypto_instance(inst), "lrw", &alg->base); if (err) goto err_free_inst; err = -EINVAL; cipher_name = alg->base.cra_name; /* Alas we screwed up the naming so we have to mangle the * cipher name. */ if (!strncmp(cipher_name, "ecb(", 4)) { int len; len = strscpy(ecb_name, cipher_name + 4, sizeof(ecb_name)); if (len < 2) goto err_free_inst; if (ecb_name[len - 1] != ')') goto err_free_inst; ecb_name[len - 1] = 0; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "lrw(%s)", ecb_name) >= CRYPTO_MAX_ALG_NAME) { err = -ENAMETOOLONG; goto err_free_inst; } } else goto err_free_inst; inst->alg.base.cra_priority = alg->base.cra_priority; inst->alg.base.cra_blocksize = LRW_BLOCK_SIZE; inst->alg.base.cra_alignmask = alg->base.cra_alignmask | (__alignof__(be128) - 1); inst->alg.ivsize = LRW_BLOCK_SIZE; inst->alg.min_keysize = alg->min_keysize + LRW_BLOCK_SIZE; inst->alg.max_keysize = alg->max_keysize + LRW_BLOCK_SIZE; inst->alg.base.cra_ctxsize = sizeof(struct lrw_tfm_ctx); inst->alg.init = lrw_init_tfm; inst->alg.exit = lrw_exit_tfm; inst->alg.setkey = lrw_setkey; inst->alg.encrypt = lrw_encrypt; inst->alg.decrypt = lrw_decrypt; inst->free = lrw_free_instance; err = skcipher_register_instance(tmpl, inst); if (err) { err_free_inst: lrw_free_instance(inst); } return err; } static struct crypto_template lrw_tmpl = { .name = "lrw", .create = lrw_create, .module = THIS_MODULE, }; static int __init lrw_module_init(void) { return crypto_register_template(&lrw_tmpl); } static void __exit lrw_module_exit(void) { crypto_unregister_template(&lrw_tmpl); } subsys_initcall(lrw_module_init); module_exit(lrw_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("LRW block cipher mode"); MODULE_ALIAS_CRYPTO("lrw"); MODULE_SOFTDEP("pre: ecb"); |
383 6 236 97 72 127 30 30 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * net busy poll support * Copyright(c) 2013 Intel Corporation. * * Author: Eliezer Tamir * * Contact Information: * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net> */ #ifndef _LINUX_NET_BUSY_POLL_H #define _LINUX_NET_BUSY_POLL_H #include <linux/netdevice.h> #include <linux/sched/clock.h> #include <linux/sched/signal.h> #include <net/ip.h> #include <net/xdp.h> /* 0 - Reserved to indicate value not set * 1..NR_CPUS - Reserved for sender_cpu * NR_CPUS+1..~0 - Region available for NAPI IDs */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) #define BUSY_POLL_BUDGET 8 #ifdef CONFIG_NET_RX_BUSY_POLL struct napi_struct; extern unsigned int sysctl_net_busy_read __read_mostly; extern unsigned int sysctl_net_busy_poll __read_mostly; static inline bool net_busy_loop_on(void) { return READ_ONCE(sysctl_net_busy_poll); } static inline bool sk_can_busy_loop(const struct sock *sk) { return READ_ONCE(sk->sk_ll_usec) && !signal_pending(current); } bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); void napi_busy_loop_rcu(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) { return 0; } static inline bool sk_can_busy_loop(struct sock *sk) { return false; } #endif /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long busy_loop_current_time(void) { #ifdef CONFIG_NET_RX_BUSY_POLL return (unsigned long)(ktime_get_ns() >> 10); #else return 0; #endif } /* in poll/select we use the global sysctl_net_ll_poll value */ static inline bool busy_loop_timeout(unsigned long start_time) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned long bp_usec = READ_ONCE(sysctl_net_busy_poll); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } #endif return true; } static inline bool sk_busy_loop_timeout(struct sock *sk, unsigned long start_time) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned long bp_usec = READ_ONCE(sk->sk_ll_usec); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } #endif return true; } static inline void sk_busy_loop(struct sock *sk, int nonblock) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int napi_id = READ_ONCE(sk->sk_napi_id); if (napi_id >= MIN_NAPI_ID) napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, READ_ONCE(sk->sk_prefer_busy_poll), READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET); #endif } /* used in the NIC receive handler to mark the skb */ static inline void skb_mark_napi_id(struct sk_buff *skb, struct napi_struct *napi) { #ifdef CONFIG_NET_RX_BUSY_POLL /* If the skb was already marked with a valid NAPI ID, avoid overwriting * it. */ if (skb->napi_id < MIN_NAPI_ID) skb->napi_id = napi->napi_id; #endif } /* used in the protocol handler to propagate the napi_id to the socket */ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL if (unlikely(READ_ONCE(sk->sk_napi_id) != skb->napi_id)) WRITE_ONCE(sk->sk_napi_id, skb->napi_id); #endif sk_rx_queue_update(sk, skb); } /* Variant of sk_mark_napi_id() for passive flow setup, * as sk->sk_napi_id and sk->sk_rx_queue_mapping content * needs to be set. */ static inline void sk_mark_napi_id_set(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL WRITE_ONCE(sk->sk_napi_id, skb->napi_id); #endif sk_rx_queue_set(sk, skb); } static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id) { #ifdef CONFIG_NET_RX_BUSY_POLL if (!READ_ONCE(sk->sk_napi_id)) WRITE_ONCE(sk->sk_napi_id, napi_id); #endif } /* variant used for unconnected sockets */ static inline void sk_mark_napi_id_once(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL __sk_mark_napi_id_once(sk, skb->napi_id); #endif } static inline void sk_mark_napi_id_once_xdp(struct sock *sk, const struct xdp_buff *xdp) { #ifdef CONFIG_NET_RX_BUSY_POLL __sk_mark_napi_id_once(sk, xdp->rxq->napi_id); #endif } #endif /* _LINUX_NET_BUSY_POLL_H */ |
71 70 70 29 41 46 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 | /* SPDX-License-Identifier: GPL-2.0 * * page_pool/helpers.h * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> * Copyright (C) 2016 Red Hat, Inc. */ /** * DOC: page_pool allocator * * The page_pool allocator is optimized for recycling page or page fragment used * by skb packet and xdp frame. * * Basic use involves replacing any alloc_pages() calls with page_pool_alloc(), * which allocate memory with or without page splitting depending on the * requested memory size. * * If the driver knows that it always requires full pages or its allocations are * always smaller than half a page, it can use one of the more specific API * calls: * * 1. page_pool_alloc_pages(): allocate memory without page splitting when * driver knows that the memory it need is always bigger than half of the page * allocated from page pool. There is no cache line dirtying for 'struct page' * when a page is recycled back to the page pool. * * 2. page_pool_alloc_frag(): allocate memory with page splitting when driver * knows that the memory it need is always smaller than or equal to half of the * page allocated from page pool. Page splitting enables memory saving and thus * avoids TLB/cache miss for data access, but there also is some cost to * implement page splitting, mainly some cache line dirtying/bouncing for * 'struct page' and atomic operation for page->pp_ref_count. * * The API keeps track of in-flight pages, in order to let API users know when * it is safe to free a page_pool object, the API users must call * page_pool_put_page() or page_pool_free_va() to free the page_pool object, or * attach the page_pool object to a page_pool-aware object like skbs marked with * skb_mark_for_recycle(). * * page_pool_put_page() may be called multiple times on the same page if a page * is split into multiple fragments. For the last fragment, it will either * recycle the page, or in case of page->_refcount > 1, it will release the DMA * mapping and in-flight state accounting. * * dma_sync_single_range_for_device() is only called for the last fragment when * page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the * last freed fragment to do the sync_for_device operation for all fragments in * the same page when a page is split. The API user must setup pool->p.max_len * and pool->p.offset correctly and ensure that page_pool_put_page() is called * with dma_sync_size being -1 for fragment API. */ #ifndef _NET_PAGE_POOL_HELPERS_H #define _NET_PAGE_POOL_HELPERS_H #include <linux/dma-mapping.h> #include <net/page_pool/types.h> #include <net/net_debug.h> #include <net/netmem.h> #ifdef CONFIG_PAGE_POOL_STATS /* Deprecated driver-facing API, use netlink instead */ int page_pool_ethtool_stats_get_count(void); u8 *page_pool_ethtool_stats_get_strings(u8 *data); u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats); bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats); #else static inline int page_pool_ethtool_stats_get_count(void) { return 0; } static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data) { return data; } static inline u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) { return data; } #endif /** * page_pool_dev_alloc_pages() - allocate a page. * @pool: pool from which to allocate * * Get a page from the page allocator or page_pool caches. */ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_pages(pool, gfp); } /** * page_pool_dev_alloc_frag() - allocate a page fragment. * @pool: pool from which to allocate * @offset: offset to the allocated page * @size: requested size * * Get a page fragment from the page allocator or page_pool caches. * * Return: * Return allocated page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_frag(pool, offset, size, gfp); } static inline struct page *page_pool_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size, gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; struct page *page; if ((*size << 1) > max_size) { *size = max_size; *offset = 0; return page_pool_alloc_pages(pool, gfp); } page = page_pool_alloc_frag(pool, offset, *size, gfp); if (unlikely(!page)) return NULL; /* There is very likely not enough space for another fragment, so append * the remaining size to the current fragment to avoid truesize * underestimate problem. */ if (pool->frag_offset + *size > max_size) { *size = max_size - *offset; pool->frag_offset = max_size; } return page; } /** * page_pool_dev_alloc() - allocate a page or a page fragment. * @pool: pool from which to allocate * @offset: offset to the allocated page * @size: in as the requested size, out as the allocated size * * Get a page or a page fragment from the page allocator or page_pool caches * depending on the requested size in order to allocate memory with least memory * utilization and performance penalty. * * Return: * Return allocated page or page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc(pool, offset, size, gfp); } static inline void *page_pool_alloc_va(struct page_pool *pool, unsigned int *size, gfp_t gfp) { unsigned int offset; struct page *page; /* Mask off __GFP_HIGHMEM to ensure we can use page_address() */ page = page_pool_alloc(pool, &offset, size, gfp & ~__GFP_HIGHMEM); if (unlikely(!page)) return NULL; return page_address(page) + offset; } /** * page_pool_dev_alloc_va() - allocate a page or a page fragment and return its * va. * @pool: pool from which to allocate * @size: in as the requested size, out as the allocated size * * This is just a thin wrapper around the page_pool_alloc() API, and * it returns va of the allocated page or page fragment. * * Return: * Return the va for the allocated page or page fragment, otherwise return NULL. */ static inline void *page_pool_dev_alloc_va(struct page_pool *pool, unsigned int *size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_va(pool, size, gfp); } /** * page_pool_get_dma_dir() - Retrieve the stored DMA direction. * @pool: pool from which page was allocated * * Get the stored dma direction. A driver might decide to store this locally * and avoid the extra cache line from page_pool to determine the direction. */ static inline enum dma_data_direction page_pool_get_dma_dir(const struct page_pool *pool) { return pool->p.dma_dir; } static inline void page_pool_fragment_netmem(netmem_ref netmem, long nr) { atomic_long_set(netmem_get_pp_ref_count_ref(netmem), nr); } /** * page_pool_fragment_page() - split a fresh page into fragments * @page: page to split * @nr: references to set * * pp_ref_count represents the number of outstanding references to the page, * which will be freed using page_pool APIs (rather than page allocator APIs * like put_page()). Such references are usually held by page_pool-aware * objects like skbs marked for page pool recycling. * * This helper allows the caller to take (set) multiple references to a * freshly allocated page. The page must be freshly allocated (have a * pp_ref_count of 1). This is commonly done by drivers and * "fragment allocators" to save atomic operations - either when they know * upfront how many references they will need; or to take MAX references and * return the unused ones with a single atomic dec(), instead of performing * multiple atomic inc() operations. */ static inline void page_pool_fragment_page(struct page *page, long nr) { page_pool_fragment_netmem(page_to_netmem(page), nr); } static inline long page_pool_unref_netmem(netmem_ref netmem, long nr) { atomic_long_t *pp_ref_count = netmem_get_pp_ref_count_ref(netmem); long ret; /* If nr == pp_ref_count then we have cleared all remaining * references to the page: * 1. 'n == 1': no need to actually overwrite it. * 2. 'n != 1': overwrite it with one, which is the rare case * for pp_ref_count draining. * * The main advantage to doing this is that not only we avoid a atomic * update, as an atomic_read is generally a much cheaper operation than * an atomic update, especially when dealing with a page that may be * referenced by only 2 or 3 users; but also unify the pp_ref_count * handling by ensuring all pages have partitioned into only 1 piece * initially, and only overwrite it when the page is partitioned into * more than one piece. */ if (atomic_long_read(pp_ref_count) == nr) { /* As we have ensured nr is always one for constant case using * the BUILD_BUG_ON(), only need to handle the non-constant case * here for pp_ref_count draining, which is a rare case. */ BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1); if (!__builtin_constant_p(nr)) atomic_long_set(pp_ref_count, 1); return 0; } ret = atomic_long_sub_return(nr, pp_ref_count); WARN_ON(ret < 0); /* We are the last user here too, reset pp_ref_count back to 1 to * ensure all pages have been partitioned into 1 piece initially, * this should be the rare case when the last two fragment users call * page_pool_unref_page() currently. */ if (unlikely(!ret)) atomic_long_set(pp_ref_count, 1); return ret; } static inline long page_pool_unref_page(struct page *page, long nr) { return page_pool_unref_netmem(page_to_netmem(page), nr); } static inline void page_pool_ref_netmem(netmem_ref netmem) { atomic_long_inc(&netmem_to_page(netmem)->pp_ref_count); } static inline void page_pool_ref_page(struct page *page) { page_pool_ref_netmem(page_to_netmem(page)); } static inline bool page_pool_is_last_ref(netmem_ref netmem) { /* If page_pool_unref_page() returns 0, we were the last user */ return page_pool_unref_netmem(netmem, 1) == 0; } static inline void page_pool_put_netmem(struct page_pool *pool, netmem_ref netmem, unsigned int dma_sync_size, bool allow_direct) { /* When page_pool isn't compiled-in, net/core/xdp.c doesn't * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL if (!page_pool_is_last_ref(netmem)) return; page_pool_put_unrefed_netmem(pool, netmem, dma_sync_size, allow_direct); #endif } /** * page_pool_put_page() - release a reference to a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * @dma_sync_size: how much of the page may have been touched by the device * @allow_direct: released by the consumer, allow lockless caching * * The outcome of this depends on the page refcnt. If the driver bumps * the refcnt > 1 this will unmap the page. If the page refcnt is 1 * the allocator owns the page and will try to recycle it in one of the pool * caches. If PP_FLAG_DMA_SYNC_DEV is set, the page will be synced for_device * using dma_sync_single_range_for_device(). */ static inline void page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { page_pool_put_netmem(pool, page_to_netmem(page), dma_sync_size, allow_direct); } static inline void page_pool_put_full_netmem(struct page_pool *pool, netmem_ref netmem, bool allow_direct) { page_pool_put_netmem(pool, netmem, -1, allow_direct); } /** * page_pool_put_full_page() - release a reference on a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * @allow_direct: released by the consumer, allow lockless caching * * Similar to page_pool_put_page(), but will DMA sync the entire memory area * as configured in &page_pool_params.max_len. */ static inline void page_pool_put_full_page(struct page_pool *pool, struct page *page, bool allow_direct) { page_pool_put_netmem(pool, page_to_netmem(page), -1, allow_direct); } /** * page_pool_recycle_direct() - release a reference on a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * * Similar to page_pool_put_full_page() but caller must guarantee safe context * (e.g NAPI), since it will recycle the page directly into the pool fast cache. */ static inline void page_pool_recycle_direct(struct page_pool *pool, struct page *page) { page_pool_put_full_page(pool, page, true); } #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \ (sizeof(dma_addr_t) > sizeof(unsigned long)) /** * page_pool_free_va() - free a va into the page_pool * @pool: pool from which va was allocated * @va: va to be freed * @allow_direct: freed by the consumer, allow lockless caching * * Free a va allocated from page_pool_allo_va(). */ static inline void page_pool_free_va(struct page_pool *pool, void *va, bool allow_direct) { page_pool_put_page(pool, virt_to_head_page(va), -1, allow_direct); } static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem) { dma_addr_t ret = netmem_get_dma_addr(netmem); if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) ret <<= PAGE_SHIFT; return ret; } /** * page_pool_get_dma_addr() - Retrieve the stored DMA address. * @page: page allocated from a page pool * * Fetch the DMA address of the page. The page pool to which the page belongs * must had been created with PP_FLAG_DMA_MAP. */ static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) { return page_pool_get_dma_addr_netmem(page_to_netmem((struct page *)page)); } /** * page_pool_dma_sync_for_cpu - sync Rx page for CPU after it's written by HW * @pool: &page_pool the @page belongs to * @page: page to sync * @offset: offset from page start to "hard" start if using PP frags * @dma_sync_size: size of the data written to the page * * Can be used as a shorthand to sync Rx pages before accessing them in the * driver. Caller must ensure the pool was created with ``PP_FLAG_DMA_MAP``. * Note that this version performs DMA sync unconditionally, even if the * associated PP doesn't perform sync-for-device. */ static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool, const struct page *page, u32 offset, u32 dma_sync_size) { dma_sync_single_range_for_cpu(pool->p.dev, page_pool_get_dma_addr(page), offset + pool->p.offset, dma_sync_size, page_pool_get_dma_dir(pool)); } static inline bool page_pool_put(struct page_pool *pool) { return refcount_dec_and_test(&pool->user_cnt); } static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) { if (unlikely(pool->p.nid != new_nid)) page_pool_update_nid(pool, new_nid); } #endif /* _NET_PAGE_POOL_HELPERS_H */ |
3 5 3 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 | /* SPDX-License-Identifier: GPL-2.0 */ /* Based on net/mac80211/trace.h */ #undef TRACE_SYSTEM #define TRACE_SYSTEM mac802154 #if !defined(__MAC802154_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ) #define __MAC802154_DRIVER_TRACE #include <linux/tracepoint.h> #include <net/mac802154.h> #include "ieee802154_i.h" #define MAXNAME 32 #define LOCAL_ENTRY __array(char, wpan_phy_name, MAXNAME) #define LOCAL_ASSIGN strscpy(__entry->wpan_phy_name, \ wpan_phy_name(local->hw.phy), MAXNAME) #define LOCAL_PR_FMT "%s" #define LOCAL_PR_ARG __entry->wpan_phy_name #define CCA_ENTRY __field(enum nl802154_cca_modes, cca_mode) \ __field(enum nl802154_cca_opts, cca_opt) #define CCA_ASSIGN \ do { \ (__entry->cca_mode) = cca->mode; \ (__entry->cca_opt) = cca->opt; \ } while (0) #define CCA_PR_FMT "cca_mode: %d, cca_opt: %d" #define CCA_PR_ARG __entry->cca_mode, __entry->cca_opt #define BOOL_TO_STR(bo) (bo) ? "true" : "false" /* Tracing for driver callbacks */ DECLARE_EVENT_CLASS(local_only_evt4, TP_PROTO(struct ieee802154_local *local), TP_ARGS(local), TP_STRUCT__entry( LOCAL_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; ), TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG) ); DEFINE_EVENT(local_only_evt4, 802154_drv_return_void, TP_PROTO(struct ieee802154_local *local), TP_ARGS(local) ); TRACE_EVENT(802154_drv_return_int, TP_PROTO(struct ieee802154_local *local, int ret), TP_ARGS(local, ret), TP_STRUCT__entry( LOCAL_ENTRY __field(int, ret) ), TP_fast_assign( LOCAL_ASSIGN; __entry->ret = ret; ), TP_printk(LOCAL_PR_FMT ", returned: %d", LOCAL_PR_ARG, __entry->ret) ); DEFINE_EVENT(local_only_evt4, 802154_drv_start, TP_PROTO(struct ieee802154_local *local), TP_ARGS(local) ); DEFINE_EVENT(local_only_evt4, 802154_drv_stop, TP_PROTO(struct ieee802154_local *local), TP_ARGS(local) ); TRACE_EVENT(802154_drv_set_channel, TP_PROTO(struct ieee802154_local *local, u8 page, u8 channel), TP_ARGS(local, page, channel), TP_STRUCT__entry( LOCAL_ENTRY __field(u8, page) __field(u8, channel) ), TP_fast_assign( LOCAL_ASSIGN; __entry->page = page; __entry->channel = channel; ), TP_printk(LOCAL_PR_FMT ", page: %d, channel: %d", LOCAL_PR_ARG, __entry->page, __entry->channel) ); TRACE_EVENT(802154_drv_set_cca_mode, TP_PROTO(struct ieee802154_local *local, const struct wpan_phy_cca *cca), TP_ARGS(local, cca), TP_STRUCT__entry( LOCAL_ENTRY CCA_ENTRY ), TP_fast_assign( LOCAL_ASSIGN; CCA_ASSIGN; ), TP_printk(LOCAL_PR_FMT ", " CCA_PR_FMT, LOCAL_PR_ARG, CCA_PR_ARG) ); TRACE_EVENT(802154_drv_set_cca_ed_level, TP_PROTO(struct ieee802154_local *local, s32 mbm), TP_ARGS(local, mbm), TP_STRUCT__entry( LOCAL_ENTRY __field(s32, mbm) ), TP_fast_assign( LOCAL_ASSIGN; __entry->mbm = mbm; ), TP_printk(LOCAL_PR_FMT ", ed level: %d", LOCAL_PR_ARG, __entry->mbm) ); TRACE_EVENT(802154_drv_set_tx_power, TP_PROTO(struct ieee802154_local *local, s32 power), TP_ARGS(local, power), TP_STRUCT__entry( LOCAL_ENTRY __field(s32, power) ), TP_fast_assign( LOCAL_ASSIGN; __entry->power = power; ), TP_printk(LOCAL_PR_FMT ", mbm: %d", LOCAL_PR_ARG, __entry->power) ); TRACE_EVENT(802154_drv_set_lbt_mode, TP_PROTO(struct ieee802154_local *local, bool mode), TP_ARGS(local, mode), TP_STRUCT__entry( LOCAL_ENTRY __field(bool, mode) ), TP_fast_assign( LOCAL_ASSIGN; __entry->mode = mode; ), TP_printk(LOCAL_PR_FMT ", lbt mode: %s", LOCAL_PR_ARG, BOOL_TO_STR(__entry->mode)) ); TRACE_EVENT(802154_drv_set_short_addr, TP_PROTO(struct ieee802154_local *local, __le16 short_addr), TP_ARGS(local, short_addr), TP_STRUCT__entry( LOCAL_ENTRY __field(__le16, short_addr) ), TP_fast_assign( LOCAL_ASSIGN; __entry->short_addr = short_addr; ), TP_printk(LOCAL_PR_FMT ", short addr: 0x%04x", LOCAL_PR_ARG, le16_to_cpu(__entry->short_addr)) ); TRACE_EVENT(802154_drv_set_pan_id, TP_PROTO(struct ieee802154_local *local, __le16 pan_id), TP_ARGS(local, pan_id), TP_STRUCT__entry( LOCAL_ENTRY __field(__le16, pan_id) ), TP_fast_assign( LOCAL_ASSIGN; __entry->pan_id = pan_id; ), TP_printk(LOCAL_PR_FMT ", pan id: 0x%04x", LOCAL_PR_ARG, le16_to_cpu(__entry->pan_id)) ); TRACE_EVENT(802154_drv_set_extended_addr, TP_PROTO(struct ieee802154_local *local, __le64 extended_addr), TP_ARGS(local, extended_addr), TP_STRUCT__entry( LOCAL_ENTRY __field(__le64, extended_addr) ), TP_fast_assign( LOCAL_ASSIGN; __entry->extended_addr = extended_addr; ), TP_printk(LOCAL_PR_FMT ", extended addr: 0x%llx", LOCAL_PR_ARG, le64_to_cpu(__entry->extended_addr)) ); TRACE_EVENT(802154_drv_set_pan_coord, TP_PROTO(struct ieee802154_local *local, bool is_coord), TP_ARGS(local, is_coord), TP_STRUCT__entry( LOCAL_ENTRY __field(bool, is_coord) ), TP_fast_assign( LOCAL_ASSIGN; __entry->is_coord = is_coord; ), TP_printk(LOCAL_PR_FMT ", is_coord: %s", LOCAL_PR_ARG, BOOL_TO_STR(__entry->is_coord)) ); TRACE_EVENT(802154_drv_set_csma_params, TP_PROTO(struct ieee802154_local *local, u8 min_be, u8 max_be, u8 max_csma_backoffs), TP_ARGS(local, min_be, max_be, max_csma_backoffs), TP_STRUCT__entry( LOCAL_ENTRY __field(u8, min_be) __field(u8, max_be) __field(u8, max_csma_backoffs) ), TP_fast_assign( LOCAL_ASSIGN, __entry->min_be = min_be; __entry->max_be = max_be; __entry->max_csma_backoffs = max_csma_backoffs; ), TP_printk(LOCAL_PR_FMT ", min be: %d, max be: %d, max csma backoffs: %d", LOCAL_PR_ARG, __entry->min_be, __entry->max_be, __entry->max_csma_backoffs) ); TRACE_EVENT(802154_drv_set_max_frame_retries, TP_PROTO(struct ieee802154_local *local, s8 max_frame_retries), TP_ARGS(local, max_frame_retries), TP_STRUCT__entry( LOCAL_ENTRY __field(s8, max_frame_retries) ), TP_fast_assign( LOCAL_ASSIGN; __entry->max_frame_retries = max_frame_retries; ), TP_printk(LOCAL_PR_FMT ", max frame retries: %d", LOCAL_PR_ARG, __entry->max_frame_retries) ); TRACE_EVENT(802154_drv_set_promiscuous_mode, TP_PROTO(struct ieee802154_local *local, bool on), TP_ARGS(local, on), TP_STRUCT__entry( LOCAL_ENTRY __field(bool, on) ), TP_fast_assign( LOCAL_ASSIGN; __entry->on = on; ), TP_printk(LOCAL_PR_FMT ", promiscuous mode: %s", LOCAL_PR_ARG, BOOL_TO_STR(__entry->on)) ); TRACE_EVENT(802154_new_scan_event, TP_PROTO(struct ieee802154_coord_desc *desc), TP_ARGS(desc), TP_STRUCT__entry( __field(__le16, pan_id) __field(__le64, addr) __field(u8, channel) __field(u8, page) ), TP_fast_assign( __entry->page = desc->page; __entry->channel = desc->channel; __entry->pan_id = desc->addr.pan_id; __entry->addr = desc->addr.extended_addr; ), TP_printk("panid: %u, coord_addr: 0x%llx, page: %u, channel: %u", __le16_to_cpu(__entry->pan_id), __le64_to_cpu(__entry->addr), __entry->page, __entry->channel) ); DEFINE_EVENT(802154_new_scan_event, 802154_scan_event, TP_PROTO(struct ieee802154_coord_desc *desc), TP_ARGS(desc) ); #endif /* !__MAC802154_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h> |
1 1 1 1 32 23 2 5 2 2 1 2 3 3 3 2 11 10 5 10 3 3 1 1 10 1 2 1 1 1 2 2 4 1 4 5 3 10 10 6 3 44 1 1 1 10 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> struct nft_ct_helper_obj { struct nf_conntrack_helper *helper4; struct nf_conntrack_helper *helper6; u8 l4proto; }; #ifdef CONFIG_NF_CONNTRACK_ZONES static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template); static unsigned int nft_ct_pcpu_template_refcnt __read_mostly; static DEFINE_MUTEX(nft_ct_pcpu_mutex); #endif static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c, enum nft_ct_keys k, enum ip_conntrack_dir d) { if (d < IP_CT_DIR_MAX) return k == NFT_CT_BYTES ? atomic64_read(&c[d].bytes) : atomic64_read(&c[d].packets); return nft_ct_get_eval_counter(c, k, IP_CT_DIR_ORIGINAL) + nft_ct_get_eval_counter(c, k, IP_CT_DIR_REPLY); } static void nft_ct_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct *priv = nft_expr_priv(expr); u32 *dest = ®s->data[priv->dreg]; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; const struct nf_conn_help *help; const struct nf_conntrack_tuple *tuple; const struct nf_conntrack_helper *helper; unsigned int state; ct = nf_ct_get(pkt->skb, &ctinfo); switch (priv->key) { case NFT_CT_STATE: if (ct) state = NF_CT_STATE_BIT(ctinfo); else if (ctinfo == IP_CT_UNTRACKED) state = NF_CT_STATE_UNTRACKED_BIT; else state = NF_CT_STATE_INVALID_BIT; *dest = state; return; default: break; } if (ct == NULL) goto err; switch (priv->key) { case NFT_CT_DIRECTION: nft_reg_store8(dest, CTINFO2DIR(ctinfo)); return; case NFT_CT_STATUS: *dest = ct->status; return; #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: *dest = READ_ONCE(ct->mark); return; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: *dest = ct->secmark; return; #endif case NFT_CT_EXPIRATION: *dest = jiffies_to_msecs(nf_ct_expires(ct)); return; case NFT_CT_HELPER: if (ct->master == NULL) goto err; help = nfct_help(ct->master); if (help == NULL) goto err; helper = rcu_dereference(help->helper); if (helper == NULL) goto err; strscpy_pad((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN); return; #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: { struct nf_conn_labels *labels = nf_ct_labels_find(ct); if (labels) memcpy(dest, labels->bits, NF_CT_LABELS_MAX_SIZE); else memset(dest, 0, NF_CT_LABELS_MAX_SIZE); return; } #endif case NFT_CT_BYTES: case NFT_CT_PKTS: { const struct nf_conn_acct *acct = nf_conn_acct_find(ct); u64 count = 0; if (acct) count = nft_ct_get_eval_counter(acct->counter, priv->key, priv->dir); memcpy(dest, &count, sizeof(count)); return; } case NFT_CT_AVGPKT: { const struct nf_conn_acct *acct = nf_conn_acct_find(ct); u64 avgcnt = 0, bcnt = 0, pcnt = 0; if (acct) { pcnt = nft_ct_get_eval_counter(acct->counter, NFT_CT_PKTS, priv->dir); bcnt = nft_ct_get_eval_counter(acct->counter, NFT_CT_BYTES, priv->dir); if (pcnt != 0) avgcnt = div64_u64(bcnt, pcnt); } memcpy(dest, &avgcnt, sizeof(avgcnt)); return; } case NFT_CT_L3PROTOCOL: nft_reg_store8(dest, nf_ct_l3num(ct)); return; case NFT_CT_PROTOCOL: nft_reg_store8(dest, nf_ct_protonum(ct)); return; #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: { const struct nf_conntrack_zone *zone = nf_ct_zone(ct); u16 zoneid; if (priv->dir < IP_CT_DIR_MAX) zoneid = nf_ct_zone_id(zone, priv->dir); else zoneid = zone->id; nft_reg_store16(dest, zoneid); return; } #endif case NFT_CT_ID: *dest = nf_ct_get_id(ct); return; default: break; } tuple = &ct->tuplehash[priv->dir].tuple; switch (priv->key) { case NFT_CT_SRC: memcpy(dest, tuple->src.u3.all, nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); return; case NFT_CT_DST: memcpy(dest, tuple->dst.u3.all, nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); return; case NFT_CT_PROTO_SRC: nft_reg_store16(dest, (__force u16)tuple->src.u.all); return; case NFT_CT_PROTO_DST: nft_reg_store16(dest, (__force u16)tuple->dst.u.all); return; case NFT_CT_SRC_IP: if (nf_ct_l3num(ct) != NFPROTO_IPV4) goto err; *dest = (__force __u32)tuple->src.u3.ip; return; case NFT_CT_DST_IP: if (nf_ct_l3num(ct) != NFPROTO_IPV4) goto err; *dest = (__force __u32)tuple->dst.u3.ip; return; case NFT_CT_SRC_IP6: if (nf_ct_l3num(ct) != NFPROTO_IPV6) goto err; memcpy(dest, tuple->src.u3.ip6, sizeof(struct in6_addr)); return; case NFT_CT_DST_IP6: if (nf_ct_l3num(ct) != NFPROTO_IPV6) goto err; memcpy(dest, tuple->dst.u3.ip6, sizeof(struct in6_addr)); return; default: break; } return; err: regs->verdict.code = NFT_BREAK; } #ifdef CONFIG_NF_CONNTRACK_ZONES static void nft_ct_set_zone_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nf_conntrack_zone zone = { .dir = NF_CT_DEFAULT_ZONE_DIR }; const struct nft_ct *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; enum ip_conntrack_info ctinfo; u16 value = nft_reg_load16(®s->data[priv->sreg]); struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (ct) /* already tracked */ return; zone.id = value; switch (priv->dir) { case IP_CT_DIR_ORIGINAL: zone.dir = NF_CT_ZONE_DIR_ORIG; break; case IP_CT_DIR_REPLY: zone.dir = NF_CT_ZONE_DIR_REPL; break; default: break; } ct = this_cpu_read(nft_ct_pcpu_template); if (likely(refcount_read(&ct->ct_general.use) == 1)) { refcount_inc(&ct->ct_general.use); nf_ct_zone_add(ct, &zone); } else { /* previous skb got queued to userspace, allocate temporary * one until percpu template can be reused. */ ct = nf_ct_tmpl_alloc(nft_net(pkt), &zone, GFP_ATOMIC); if (!ct) { regs->verdict.code = NF_DROP; return; } __set_bit(IPS_CONFIRMED_BIT, &ct->status); } nf_ct_set(skb, ct, IP_CT_NEW); } #endif static void nft_ct_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; #if defined(CONFIG_NF_CONNTRACK_MARK) || defined(CONFIG_NF_CONNTRACK_SECMARK) u32 value = regs->data[priv->sreg]; #endif enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL || nf_ct_is_template(ct)) return; switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: if (READ_ONCE(ct->mark) != value) { WRITE_ONCE(ct->mark, value); nf_conntrack_event_cache(IPCT_MARK, ct); } break; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: if (ct->secmark != value) { ct->secmark = value; nf_conntrack_event_cache(IPCT_SECMARK, ct); } break; #endif #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: nf_connlabels_replace(ct, ®s->data[priv->sreg], ®s->data[priv->sreg], NF_CT_LABELS_MAX_SIZE / sizeof(u32)); break; #endif #ifdef CONFIG_NF_CONNTRACK_EVENTS case NFT_CT_EVENTMASK: { struct nf_conntrack_ecache *e = nf_ct_ecache_find(ct); u32 ctmask = regs->data[priv->sreg]; if (e) { if (e->ctmask != ctmask) e->ctmask = ctmask; break; } if (ctmask && !nf_ct_is_confirmed(ct)) nf_ct_ecache_ext_add(ct, ctmask, 0, GFP_ATOMIC); break; } #endif default: break; } } static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = { [NFTA_CT_DREG] = { .type = NLA_U32 }, [NFTA_CT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_CT_DIRECTION] = { .type = NLA_U8 }, [NFTA_CT_SREG] = { .type = NLA_U32 }, }; #ifdef CONFIG_NF_CONNTRACK_ZONES static void nft_ct_tmpl_put_pcpu(void) { struct nf_conn *ct; int cpu; for_each_possible_cpu(cpu) { ct = per_cpu(nft_ct_pcpu_template, cpu); if (!ct) break; nf_ct_put(ct); per_cpu(nft_ct_pcpu_template, cpu) = NULL; } } static bool nft_ct_tmpl_alloc_pcpu(void) { struct nf_conntrack_zone zone = { .id = 0 }; struct nf_conn *tmp; int cpu; if (nft_ct_pcpu_template_refcnt) return true; for_each_possible_cpu(cpu) { tmp = nf_ct_tmpl_alloc(&init_net, &zone, GFP_KERNEL); if (!tmp) { nft_ct_tmpl_put_pcpu(); return false; } __set_bit(IPS_CONFIRMED_BIT, &tmp->status); per_cpu(nft_ct_pcpu_template, cpu) = tmp; } return true; } #endif static int nft_ct_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); unsigned int len; int err; priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); priv->dir = IP_CT_DIR_MAX; switch (priv->key) { case NFT_CT_DIRECTION: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = sizeof(u8); break; case NFT_CT_STATE: case NFT_CT_STATUS: #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: #endif case NFT_CT_EXPIRATION: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = sizeof(u32); break; #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = NF_CT_LABELS_MAX_SIZE; break; #endif case NFT_CT_HELPER: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = NF_CT_HELPER_NAME_LEN; break; case NFT_CT_L3PROTOCOL: case NFT_CT_PROTOCOL: /* For compatibility, do not report error if NFTA_CT_DIRECTION * attribute is specified. */ len = sizeof(u8); break; case NFT_CT_SRC: case NFT_CT_DST: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; switch (ctx->family) { case NFPROTO_IPV4: len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip); break; case NFPROTO_IPV6: case NFPROTO_INET: len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip6); break; default: return -EAFNOSUPPORT; } break; case NFT_CT_SRC_IP: case NFT_CT_DST_IP: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip); break; case NFT_CT_SRC_IP6: case NFT_CT_DST_IP6: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip6); break; case NFT_CT_PROTO_SRC: case NFT_CT_PROTO_DST: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; len = sizeof_field(struct nf_conntrack_tuple, src.u.all); break; case NFT_CT_BYTES: case NFT_CT_PKTS: case NFT_CT_AVGPKT: len = sizeof(u64); break; #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: len = sizeof(u16); break; #endif case NFT_CT_ID: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = sizeof(u32); break; default: return -EOPNOTSUPP; } if (tb[NFTA_CT_DIRECTION] != NULL) { priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); switch (priv->dir) { case IP_CT_DIR_ORIGINAL: case IP_CT_DIR_REPLY: break; default: return -EINVAL; } } priv->len = len; err = nft_parse_register_store(ctx, tb[NFTA_CT_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); if (err < 0) return err; err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) return err; if (priv->key == NFT_CT_BYTES || priv->key == NFT_CT_PKTS || priv->key == NFT_CT_AVGPKT) nf_ct_set_acct(ctx->net, true); return 0; } static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) { switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: nf_connlabels_put(ctx->net); break; #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: mutex_lock(&nft_ct_pcpu_mutex); if (--nft_ct_pcpu_template_refcnt == 0) nft_ct_tmpl_put_pcpu(); mutex_unlock(&nft_ct_pcpu_mutex); break; #endif default: break; } } static int nft_ct_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); unsigned int len; int err; priv->dir = IP_CT_DIR_MAX; priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = sizeof_field(struct nf_conn, mark); break; #endif #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = NF_CT_LABELS_MAX_SIZE; err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); if (err) return err; break; #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: mutex_lock(&nft_ct_pcpu_mutex); if (!nft_ct_tmpl_alloc_pcpu()) { mutex_unlock(&nft_ct_pcpu_mutex); return -ENOMEM; } nft_ct_pcpu_template_refcnt++; mutex_unlock(&nft_ct_pcpu_mutex); len = sizeof(u16); break; #endif #ifdef CONFIG_NF_CONNTRACK_EVENTS case NFT_CT_EVENTMASK: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = sizeof(u32); break; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = sizeof(u32); break; #endif default: return -EOPNOTSUPP; } if (tb[NFTA_CT_DIRECTION]) { priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); switch (priv->dir) { case IP_CT_DIR_ORIGINAL: case IP_CT_DIR_REPLY: break; default: err = -EINVAL; goto err1; } } priv->len = len; err = nft_parse_register_load(ctx, tb[NFTA_CT_SREG], &priv->sreg, len); if (err < 0) goto err1; err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) goto err1; return 0; err1: __nft_ct_set_destroy(ctx, priv); return err; } static void nft_ct_get_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { nf_ct_netns_put(ctx->net, ctx->family); } static void nft_ct_set_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_ct *priv = nft_expr_priv(expr); __nft_ct_set_destroy(ctx, priv); nf_ct_netns_put(ctx->net, ctx->family); } static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_ct *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_CT_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) goto nla_put_failure; switch (priv->key) { case NFT_CT_SRC: case NFT_CT_DST: case NFT_CT_SRC_IP: case NFT_CT_DST_IP: case NFT_CT_SRC_IP6: case NFT_CT_DST_IP6: case NFT_CT_PROTO_SRC: case NFT_CT_PROTO_DST: if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) goto nla_put_failure; break; case NFT_CT_BYTES: case NFT_CT_PKTS: case NFT_CT_AVGPKT: case NFT_CT_ZONE: if (priv->dir < IP_CT_DIR_MAX && nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) goto nla_put_failure; break; default: break; } return 0; nla_put_failure: return -1; } static bool nft_ct_get_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_ct *priv = nft_expr_priv(expr); const struct nft_ct *ct; if (!nft_reg_track_cmp(track, expr, priv->dreg)) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } ct = nft_expr_priv(track->regs[priv->dreg].selector); if (priv->key != ct->key) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } if (!track->regs[priv->dreg].bitwise) return true; return nft_expr_reduce_bitwise(track, expr); } static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_ct *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_CT_SREG, priv->sreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) goto nla_put_failure; switch (priv->key) { case NFT_CT_ZONE: if (priv->dir < IP_CT_DIR_MAX && nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) goto nla_put_failure; break; default: break; } return 0; nla_put_failure: return -1; } static struct nft_expr_type nft_ct_type; static const struct nft_expr_ops nft_ct_get_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_get_eval, .init = nft_ct_get_init, .destroy = nft_ct_get_destroy, .dump = nft_ct_get_dump, .reduce = nft_ct_get_reduce, }; static bool nft_ct_set_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { int i; for (i = 0; i < NFT_REG32_NUM; i++) { if (!track->regs[i].selector) continue; if (track->regs[i].selector->ops != &nft_ct_get_ops) continue; __nft_reg_track_cancel(track, i); } return false; } #ifdef CONFIG_MITIGATION_RETPOLINE static const struct nft_expr_ops nft_ct_get_fast_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_get_fast_eval, .init = nft_ct_get_init, .destroy = nft_ct_get_destroy, .dump = nft_ct_get_dump, .reduce = nft_ct_set_reduce, }; #endif static const struct nft_expr_ops nft_ct_set_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_set_eval, .init = nft_ct_set_init, .destroy = nft_ct_set_destroy, .dump = nft_ct_set_dump, .reduce = nft_ct_set_reduce, }; #ifdef CONFIG_NF_CONNTRACK_ZONES static const struct nft_expr_ops nft_ct_set_zone_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_set_zone_eval, .init = nft_ct_set_init, .destroy = nft_ct_set_destroy, .dump = nft_ct_set_dump, .reduce = nft_ct_set_reduce, }; #endif static const struct nft_expr_ops * nft_ct_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { if (tb[NFTA_CT_KEY] == NULL) return ERR_PTR(-EINVAL); if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG]) return ERR_PTR(-EINVAL); if (tb[NFTA_CT_DREG]) { #ifdef CONFIG_MITIGATION_RETPOLINE u32 k = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); switch (k) { case NFT_CT_STATE: case NFT_CT_DIRECTION: case NFT_CT_STATUS: case NFT_CT_MARK: case NFT_CT_SECMARK: return &nft_ct_get_fast_ops; } #endif return &nft_ct_get_ops; } if (tb[NFTA_CT_SREG]) { #ifdef CONFIG_NF_CONNTRACK_ZONES if (nla_get_be32(tb[NFTA_CT_KEY]) == htonl(NFT_CT_ZONE)) return &nft_ct_set_zone_ops; #endif return &nft_ct_set_ops; } return ERR_PTR(-EINVAL); } static struct nft_expr_type nft_ct_type __read_mostly = { .name = "ct", .select_ops = nft_ct_select_ops, .policy = nft_ct_policy, .maxattr = NFTA_CT_MAX, .owner = THIS_MODULE, }; static void nft_notrack_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct sk_buff *skb = pkt->skb; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(pkt->skb, &ctinfo); /* Previously seen (loopback or untracked)? Ignore. */ if (ct || ctinfo == IP_CT_UNTRACKED) return; nf_ct_set(skb, ct, IP_CT_UNTRACKED); } static struct nft_expr_type nft_notrack_type; static const struct nft_expr_ops nft_notrack_ops = { .type = &nft_notrack_type, .size = NFT_EXPR_SIZE(0), .eval = nft_notrack_eval, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_notrack_type __read_mostly = { .name = "notrack", .ops = &nft_notrack_ops, .owner = THIS_MODULE, }; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT static int nft_ct_timeout_parse_policy(void *timeouts, const struct nf_conntrack_l4proto *l4proto, struct net *net, const struct nlattr *attr) { struct nlattr **tb; int ret = 0; tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), GFP_KERNEL); if (!tb) return -ENOMEM; ret = nla_parse_nested_deprecated(tb, l4proto->ctnl_timeout.nlattr_max, attr, l4proto->ctnl_timeout.nla_policy, NULL); if (ret < 0) goto err; ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts); err: kfree(tb); return ret; } struct nft_ct_timeout_obj { struct nf_ct_timeout *timeout; u8 l4proto; }; static void nft_ct_timeout_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct_timeout_obj *priv = nft_obj_data(obj); struct nf_conn *ct = (struct nf_conn *)skb_nfct(pkt->skb); struct nf_conn_timeout *timeout; const unsigned int *values; if (priv->l4proto != pkt->tprot) return; if (!ct || nf_ct_is_template(ct) || nf_ct_is_confirmed(ct)) return; timeout = nf_ct_timeout_find(ct); if (!timeout) { timeout = nf_ct_timeout_ext_add(ct, priv->timeout, GFP_ATOMIC); if (!timeout) { regs->verdict.code = NF_DROP; return; } } rcu_assign_pointer(timeout->timeout, priv->timeout); /* adjust the timeout as per 'new' state. ct is unconfirmed, * so the current timestamp must not be added. */ values = nf_ct_timeout_data(timeout); if (values) nf_ct_refresh(ct, pkt->skb, values[0]); } static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_ct_timeout_obj *priv = nft_obj_data(obj); const struct nf_conntrack_l4proto *l4proto; struct nf_ct_timeout *timeout; int l3num = ctx->family; __u8 l4num; int ret; if (!tb[NFTA_CT_TIMEOUT_L4PROTO] || !tb[NFTA_CT_TIMEOUT_DATA]) return -EINVAL; if (tb[NFTA_CT_TIMEOUT_L3PROTO]) l3num = ntohs(nla_get_be16(tb[NFTA_CT_TIMEOUT_L3PROTO])); l4num = nla_get_u8(tb[NFTA_CT_TIMEOUT_L4PROTO]); priv->l4proto = l4num; l4proto = nf_ct_l4proto_find(l4num); if (l4proto->l4proto != l4num) { ret = -EOPNOTSUPP; goto err_proto_put; } timeout = kzalloc(sizeof(struct nf_ct_timeout) + l4proto->ctnl_timeout.obj_size, GFP_KERNEL); if (timeout == NULL) { ret = -ENOMEM; goto err_proto_put; } ret = nft_ct_timeout_parse_policy(&timeout->data, l4proto, ctx->net, tb[NFTA_CT_TIMEOUT_DATA]); if (ret < 0) goto err_free_timeout; timeout->l3num = l3num; timeout->l4proto = l4proto; ret = nf_ct_netns_get(ctx->net, ctx->family); if (ret < 0) goto err_free_timeout; priv->timeout = timeout; return 0; err_free_timeout: kfree(timeout); err_proto_put: return ret; } static void nft_ct_timeout_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { struct nft_ct_timeout_obj *priv = nft_obj_data(obj); struct nf_ct_timeout *timeout = priv->timeout; nf_ct_untimeout(ctx->net, timeout); nf_ct_netns_put(ctx->net, ctx->family); kfree(priv->timeout); } static int nft_ct_timeout_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_ct_timeout_obj *priv = nft_obj_data(obj); const struct nf_ct_timeout *timeout = priv->timeout; struct nlattr *nest_params; int ret; if (nla_put_u8(skb, NFTA_CT_TIMEOUT_L4PROTO, timeout->l4proto->l4proto) || nla_put_be16(skb, NFTA_CT_TIMEOUT_L3PROTO, htons(timeout->l3num))) return -1; nest_params = nla_nest_start(skb, NFTA_CT_TIMEOUT_DATA); if (!nest_params) return -1; ret = timeout->l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->data); if (ret < 0) return -1; nla_nest_end(skb, nest_params); return 0; } static const struct nla_policy nft_ct_timeout_policy[NFTA_CT_TIMEOUT_MAX + 1] = { [NFTA_CT_TIMEOUT_L3PROTO] = {.type = NLA_U16 }, [NFTA_CT_TIMEOUT_L4PROTO] = {.type = NLA_U8 }, [NFTA_CT_TIMEOUT_DATA] = {.type = NLA_NESTED }, }; static struct nft_object_type nft_ct_timeout_obj_type; static const struct nft_object_ops nft_ct_timeout_obj_ops = { .type = &nft_ct_timeout_obj_type, .size = sizeof(struct nft_ct_timeout_obj), .eval = nft_ct_timeout_obj_eval, .init = nft_ct_timeout_obj_init, .destroy = nft_ct_timeout_obj_destroy, .dump = nft_ct_timeout_obj_dump, }; static struct nft_object_type nft_ct_timeout_obj_type __read_mostly = { .type = NFT_OBJECT_CT_TIMEOUT, .ops = &nft_ct_timeout_obj_ops, .maxattr = NFTA_CT_TIMEOUT_MAX, .policy = nft_ct_timeout_policy, .owner = THIS_MODULE, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_ct_helper_obj *priv = nft_obj_data(obj); struct nf_conntrack_helper *help4, *help6; char name[NF_CT_HELPER_NAME_LEN]; int family = ctx->family; int err; if (!tb[NFTA_CT_HELPER_NAME] || !tb[NFTA_CT_HELPER_L4PROTO]) return -EINVAL; priv->l4proto = nla_get_u8(tb[NFTA_CT_HELPER_L4PROTO]); if (!priv->l4proto) return -ENOENT; nla_strscpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name)); if (tb[NFTA_CT_HELPER_L3PROTO]) family = ntohs(nla_get_be16(tb[NFTA_CT_HELPER_L3PROTO])); help4 = NULL; help6 = NULL; switch (family) { case NFPROTO_IPV4: if (ctx->family == NFPROTO_IPV6) return -EINVAL; help4 = nf_conntrack_helper_try_module_get(name, family, priv->l4proto); break; case NFPROTO_IPV6: if (ctx->family == NFPROTO_IPV4) return -EINVAL; help6 = nf_conntrack_helper_try_module_get(name, family, priv->l4proto); break; case NFPROTO_NETDEV: case NFPROTO_BRIDGE: case NFPROTO_INET: help4 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV4, priv->l4proto); help6 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV6, priv->l4proto); break; default: return -EAFNOSUPPORT; } /* && is intentional; only error if INET found neither ipv4 or ipv6 */ if (!help4 && !help6) return -ENOENT; priv->helper4 = help4; priv->helper6 = help6; err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) goto err_put_helper; return 0; err_put_helper: if (priv->helper4) nf_conntrack_helper_put(priv->helper4); if (priv->helper6) nf_conntrack_helper_put(priv->helper6); return err; } static void nft_ct_helper_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { struct nft_ct_helper_obj *priv = nft_obj_data(obj); if (priv->helper4) nf_conntrack_helper_put(priv->helper4); if (priv->helper6) nf_conntrack_helper_put(priv->helper6); nf_ct_netns_put(ctx->net, ctx->family); } static void nft_ct_helper_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct_helper_obj *priv = nft_obj_data(obj); struct nf_conn *ct = (struct nf_conn *)skb_nfct(pkt->skb); struct nf_conntrack_helper *to_assign = NULL; struct nf_conn_help *help; if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct) || priv->l4proto != nf_ct_protonum(ct)) return; switch (nf_ct_l3num(ct)) { case NFPROTO_IPV4: to_assign = priv->helper4; break; case NFPROTO_IPV6: to_assign = priv->helper6; break; default: WARN_ON_ONCE(1); return; } if (!to_assign) return; if (test_bit(IPS_HELPER_BIT, &ct->status)) return; help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help) { rcu_assign_pointer(help->helper, to_assign); set_bit(IPS_HELPER_BIT, &ct->status); } } static int nft_ct_helper_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_ct_helper_obj *priv = nft_obj_data(obj); const struct nf_conntrack_helper *helper; u16 family; if (priv->helper4 && priv->helper6) { family = NFPROTO_INET; helper = priv->helper4; } else if (priv->helper6) { family = NFPROTO_IPV6; helper = priv->helper6; } else { family = NFPROTO_IPV4; helper = priv->helper4; } if (nla_put_string(skb, NFTA_CT_HELPER_NAME, helper->name)) return -1; if (nla_put_u8(skb, NFTA_CT_HELPER_L4PROTO, priv->l4proto)) return -1; if (nla_put_be16(skb, NFTA_CT_HELPER_L3PROTO, htons(family))) return -1; return 0; } static const struct nla_policy nft_ct_helper_policy[NFTA_CT_HELPER_MAX + 1] = { [NFTA_CT_HELPER_NAME] = { .type = NLA_STRING, .len = NF_CT_HELPER_NAME_LEN - 1 }, [NFTA_CT_HELPER_L3PROTO] = { .type = NLA_U16 }, [NFTA_CT_HELPER_L4PROTO] = { .type = NLA_U8 }, }; static struct nft_object_type nft_ct_helper_obj_type; static const struct nft_object_ops nft_ct_helper_obj_ops = { .type = &nft_ct_helper_obj_type, .size = sizeof(struct nft_ct_helper_obj), .eval = nft_ct_helper_obj_eval, .init = nft_ct_helper_obj_init, .destroy = nft_ct_helper_obj_destroy, .dump = nft_ct_helper_obj_dump, }; static struct nft_object_type nft_ct_helper_obj_type __read_mostly = { .type = NFT_OBJECT_CT_HELPER, .ops = &nft_ct_helper_obj_ops, .maxattr = NFTA_CT_HELPER_MAX, .policy = nft_ct_helper_policy, .owner = THIS_MODULE, }; struct nft_ct_expect_obj { u16 l3num; __be16 dport; u8 l4proto; u8 size; u32 timeout; }; static int nft_ct_expect_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_ct_expect_obj *priv = nft_obj_data(obj); if (!tb[NFTA_CT_EXPECT_L4PROTO] || !tb[NFTA_CT_EXPECT_DPORT] || !tb[NFTA_CT_EXPECT_TIMEOUT] || !tb[NFTA_CT_EXPECT_SIZE]) return -EINVAL; priv->l3num = ctx->family; if (tb[NFTA_CT_EXPECT_L3PROTO]) priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO])); switch (priv->l3num) { case NFPROTO_IPV4: case NFPROTO_IPV6: if (priv->l3num == ctx->family || ctx->family == NFPROTO_INET) break; return -EINVAL; case NFPROTO_INET: /* tuple.src.l3num supports NFPROTO_IPV4/6 only */ default: return -EAFNOSUPPORT; } priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]); switch (priv->l4proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_DCCP: case IPPROTO_SCTP: break; default: return -EOPNOTSUPP; } priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]); priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]); priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]); return nf_ct_netns_get(ctx->net, ctx->family); } static void nft_ct_expect_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { nf_ct_netns_put(ctx->net, ctx->family); } static int nft_ct_expect_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_ct_expect_obj *priv = nft_obj_data(obj); if (nla_put_be16(skb, NFTA_CT_EXPECT_L3PROTO, htons(priv->l3num)) || nla_put_u8(skb, NFTA_CT_EXPECT_L4PROTO, priv->l4proto) || nla_put_be16(skb, NFTA_CT_EXPECT_DPORT, priv->dport) || nla_put_u32(skb, NFTA_CT_EXPECT_TIMEOUT, priv->timeout) || nla_put_u8(skb, NFTA_CT_EXPECT_SIZE, priv->size)) return -1; return 0; } static void nft_ct_expect_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct_expect_obj *priv = nft_obj_data(obj); struct nf_conntrack_expect *exp; enum ip_conntrack_info ctinfo; struct nf_conn_help *help; enum ip_conntrack_dir dir; u16 l3num = priv->l3num; struct nf_conn *ct; ct = nf_ct_get(pkt->skb, &ctinfo); if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) { regs->verdict.code = NFT_BREAK; return; } dir = CTINFO2DIR(ctinfo); help = nfct_help(ct); if (!help) help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (!help) { regs->verdict.code = NF_DROP; return; } if (help->expecting[NF_CT_EXPECT_CLASS_DEFAULT] >= priv->size) { regs->verdict.code = NFT_BREAK; return; } if (l3num == NFPROTO_INET) l3num = nf_ct_l3num(ct); exp = nf_ct_expect_alloc(ct); if (exp == NULL) { regs->verdict.code = NF_DROP; return; } nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, l3num, &ct->tuplehash[!dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3, priv->l4proto, NULL, &priv->dport); exp->timeout.expires = jiffies + priv->timeout * HZ; if (nf_ct_expect_related(exp, 0) != 0) regs->verdict.code = NF_DROP; } static const struct nla_policy nft_ct_expect_policy[NFTA_CT_EXPECT_MAX + 1] = { [NFTA_CT_EXPECT_L3PROTO] = { .type = NLA_U16 }, [NFTA_CT_EXPECT_L4PROTO] = { .type = NLA_U8 }, [NFTA_CT_EXPECT_DPORT] = { .type = NLA_U16 }, [NFTA_CT_EXPECT_TIMEOUT] = { .type = NLA_U32 }, [NFTA_CT_EXPECT_SIZE] = { .type = NLA_U8 }, }; static struct nft_object_type nft_ct_expect_obj_type; static const struct nft_object_ops nft_ct_expect_obj_ops = { .type = &nft_ct_expect_obj_type, .size = sizeof(struct nft_ct_expect_obj), .eval = nft_ct_expect_obj_eval, .init = nft_ct_expect_obj_init, .destroy = nft_ct_expect_obj_destroy, .dump = nft_ct_expect_obj_dump, }; static struct nft_object_type nft_ct_expect_obj_type __read_mostly = { .type = NFT_OBJECT_CT_EXPECT, .ops = &nft_ct_expect_obj_ops, .maxattr = NFTA_CT_EXPECT_MAX, .policy = nft_ct_expect_policy, .owner = THIS_MODULE, }; static int __init nft_ct_module_init(void) { int err; BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > NFT_REG_SIZE); err = nft_register_expr(&nft_ct_type); if (err < 0) return err; err = nft_register_expr(&nft_notrack_type); if (err < 0) goto err1; err = nft_register_obj(&nft_ct_helper_obj_type); if (err < 0) goto err2; err = nft_register_obj(&nft_ct_expect_obj_type); if (err < 0) goto err3; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT err = nft_register_obj(&nft_ct_timeout_obj_type); if (err < 0) goto err4; #endif return 0; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT err4: nft_unregister_obj(&nft_ct_expect_obj_type); #endif err3: nft_unregister_obj(&nft_ct_helper_obj_type); err2: nft_unregister_expr(&nft_notrack_type); err1: nft_unregister_expr(&nft_ct_type); return err; } static void __exit nft_ct_module_exit(void) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT nft_unregister_obj(&nft_ct_timeout_obj_type); #endif nft_unregister_obj(&nft_ct_expect_obj_type); nft_unregister_obj(&nft_ct_helper_obj_type); nft_unregister_expr(&nft_notrack_type); nft_unregister_expr(&nft_ct_type); } module_init(nft_ct_module_init); module_exit(nft_ct_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS_NFT_EXPR("ct"); MODULE_ALIAS_NFT_EXPR("notrack"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_EXPECT); MODULE_DESCRIPTION("Netfilter nf_tables conntrack module"); |
5 23 27 21 16 21 7 16 23 19 7 25 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * sha512_base.h - core logic for SHA-512 implementations * * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org> */ #ifndef _CRYPTO_SHA512_BASE_H #define _CRYPTO_SHA512_BASE_H #include <crypto/internal/hash.h> #include <crypto/sha2.h> #include <linux/crypto.h> #include <linux/module.h> #include <linux/string.h> #include <linux/unaligned.h> typedef void (sha512_block_fn)(struct sha512_state *sst, u8 const *src, int blocks); static inline int sha384_base_init(struct shash_desc *desc) { struct sha512_state *sctx = shash_desc_ctx(desc); sctx->state[0] = SHA384_H0; sctx->state[1] = SHA384_H1; sctx->state[2] = SHA384_H2; sctx->state[3] = SHA384_H3; sctx->state[4] = SHA384_H4; sctx->state[5] = SHA384_H5; sctx->state[6] = SHA384_H6; sctx->state[7] = SHA384_H7; sctx->count[0] = sctx->count[1] = 0; return 0; } static inline int sha512_base_init(struct shash_desc *desc) { struct sha512_state *sctx = shash_desc_ctx(desc); sctx->state[0] = SHA512_H0; sctx->state[1] = SHA512_H1; sctx->state[2] = SHA512_H2; sctx->state[3] = SHA512_H3; sctx->state[4] = SHA512_H4; sctx->state[5] = SHA512_H5; sctx->state[6] = SHA512_H6; sctx->state[7] = SHA512_H7; sctx->count[0] = sctx->count[1] = 0; return 0; } static inline int sha512_base_do_update(struct shash_desc *desc, const u8 *data, unsigned int len, sha512_block_fn *block_fn) { struct sha512_state *sctx = shash_desc_ctx(desc); unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE; sctx->count[0] += len; if (sctx->count[0] < len) sctx->count[1]++; if (unlikely((partial + len) >= SHA512_BLOCK_SIZE)) { int blocks; if (partial) { int p = SHA512_BLOCK_SIZE - partial; memcpy(sctx->buf + partial, data, p); data += p; len -= p; block_fn(sctx, sctx->buf, 1); } blocks = len / SHA512_BLOCK_SIZE; len %= SHA512_BLOCK_SIZE; if (blocks) { block_fn(sctx, data, blocks); data += blocks * SHA512_BLOCK_SIZE; } partial = 0; } if (len) memcpy(sctx->buf + partial, data, len); return 0; } static inline int sha512_base_do_finalize(struct shash_desc *desc, sha512_block_fn *block_fn) { const int bit_offset = SHA512_BLOCK_SIZE - sizeof(__be64[2]); struct sha512_state *sctx = shash_desc_ctx(desc); __be64 *bits = (__be64 *)(sctx->buf + bit_offset); unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE; sctx->buf[partial++] = 0x80; if (partial > bit_offset) { memset(sctx->buf + partial, 0x0, SHA512_BLOCK_SIZE - partial); partial = 0; block_fn(sctx, sctx->buf, 1); } memset(sctx->buf + partial, 0x0, bit_offset - partial); bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61); bits[1] = cpu_to_be64(sctx->count[0] << 3); block_fn(sctx, sctx->buf, 1); return 0; } static inline int sha512_base_finish(struct shash_desc *desc, u8 *out) { unsigned int digest_size = crypto_shash_digestsize(desc->tfm); struct sha512_state *sctx = shash_desc_ctx(desc); __be64 *digest = (__be64 *)out; int i; for (i = 0; digest_size > 0; i++, digest_size -= sizeof(__be64)) put_unaligned_be64(sctx->state[i], digest++); memzero_explicit(sctx, sizeof(*sctx)); return 0; } #endif /* _CRYPTO_SHA512_BASE_H */ |
9875 9889 9959 10655 57 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_FPU_CONTEXT_H #define __X86_KERNEL_FPU_CONTEXT_H #include <asm/fpu/xstate.h> #include <asm/trace/fpu.h> /* Functions related to FPU context tracking */ /* * The in-register FPU state for an FPU context on a CPU is assumed to be * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx * matches the FPU. * * If the FPU register state is valid, the kernel can skip restoring the * FPU state from memory. * * Any code that clobbers the FPU registers or updates the in-memory * FPU state for a task MUST let the rest of the kernel know that the * FPU registers are no longer valid for this task. * * Invalidate a resource you control: CPU if using the CPU for something else * (with preemption disabled), FPU for the current task, or a task that * is prevented from running by the current task. */ static inline void __cpu_invalidate_fpregs_state(void) { __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) { fpu->last_cpu = -1; } static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) { return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } static inline void fpregs_deactivate(struct fpu *fpu) { __this_cpu_write(fpu_fpregs_owner_ctx, NULL); trace_x86_fpu_regs_deactivated(fpu); } static inline void fpregs_activate(struct fpu *fpu) { __this_cpu_write(fpu_fpregs_owner_ctx, fpu); trace_x86_fpu_regs_activated(fpu); } /* Internal helper for switch_fpu_return() and signal frame setup */ static inline void fpregs_restore_userregs(void) { struct fpu *fpu = ¤t->thread.fpu; int cpu = smp_processor_id(); if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER))) return; if (!fpregs_state_valid(fpu, cpu)) { /* * This restores _all_ xstate which has not been * established yet. * * If PKRU is enabled, then the PKRU value is already * correct because it was either set in switch_to() or in * flush_thread(). So it is excluded because it might be * not up to date in current->thread.fpu.xsave state. * * XFD state is handled in restore_fpregs_from_fpstate(). */ restore_fpregs_from_fpstate(fpu->fpstate, XFEATURE_MASK_FPSTATE); fpregs_activate(fpu); fpu->last_cpu = cpu; } clear_thread_flag(TIF_NEED_FPU_LOAD); } #endif |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ #ifndef _ASM_X86_STACKTRACE_H #define _ASM_X86_STACKTRACE_H #include <linux/uaccess.h> #include <linux/ptrace.h> #include <asm/cpu_entry_area.h> #include <asm/switch_to.h> enum stack_type { STACK_TYPE_UNKNOWN, STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ, STACK_TYPE_ENTRY, STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, }; struct stack_info { enum stack_type type; unsigned long *begin, *end, *next_sp; }; bool in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info); bool in_entry_stack(unsigned long *stack, struct stack_info *info); int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, struct stack_info *info); static __always_inline bool get_stack_guard_info(unsigned long *stack, struct stack_info *info) { /* make sure it's not in the stack proper */ if (get_stack_info_noinstr(stack, current, info)) return false; /* but if it is in the page below it, we hit a guard */ return get_stack_info_noinstr((void *)stack + PAGE_SIZE, current, info); } const char *stack_type_name(enum stack_type type); static inline bool on_stack(struct stack_info *info, void *addr, size_t len) { void *begin = info->begin; void *end = info->end; return (info->type != STACK_TYPE_UNKNOWN && addr >= begin && addr < end && addr + len > begin && addr + len <= end); } #ifdef CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 #else #define STACKSLOTS_PER_LINE 4 #endif #ifdef CONFIG_FRAME_POINTER static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) return (unsigned long *)regs->bp; if (task == current) return __builtin_frame_address(0); return &((struct inactive_task_frame *)task->thread.sp)->bp; } #else static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { return NULL; } #endif /* CONFIG_FRAME_POINTER */ static inline unsigned long * get_stack_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) return (unsigned long *)regs->sp; if (task == current) return __builtin_frame_address(0); return (unsigned long *)task->thread.sp; } /* The form of the top of the frame on the stack */ struct stack_frame { struct stack_frame *next_frame; unsigned long return_address; }; struct stack_frame_ia32 { u32 next_frame; u32 return_address; }; void show_opcodes(struct pt_regs *regs, const char *loglvl); void show_ip(struct pt_regs *regs, const char *loglvl); #endif /* _ASM_X86_STACKTRACE_H */ |
2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 | // SPDX-License-Identifier: GPL-2.0-only /* Kernel module to match AH parameters. */ /* (C) 1999-2000 Yon Uriarte <yon@astaro.de> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/in.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/netfilter_ipv4/ipt_ah.h> #include <linux/netfilter/x_tables.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>"); MODULE_DESCRIPTION("Xtables: IPv4 IPsec-AH SPI match"); /* Returns 1 if the spi is matched by the range, 0 otherwise */ static inline bool spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) { bool r; pr_debug("spi_match:%c 0x%x <= 0x%x <= 0x%x\n", invert ? '!' : ' ', min, spi, max); r = (spi >= min && spi <= max) ^ invert; pr_debug(" result %s\n", r ? "PASS" : "FAILED"); return r; } static bool ah_mt(const struct sk_buff *skb, struct xt_action_param *par) { struct ip_auth_hdr _ahdr; const struct ip_auth_hdr *ah; const struct ipt_ah *ahinfo = par->matchinfo; /* Must not be a fragment. */ if (par->fragoff != 0) return false; ah = skb_header_pointer(skb, par->thoff, sizeof(_ahdr), &_ahdr); if (ah == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ pr_debug("Dropping evil AH tinygram.\n"); par->hotdrop = true; return false; } return spi_match(ahinfo->spis[0], ahinfo->spis[1], ntohl(ah->spi), !!(ahinfo->invflags & IPT_AH_INV_SPI)); } static int ah_mt_check(const struct xt_mtchk_param *par) { const struct ipt_ah *ahinfo = par->matchinfo; /* Must specify no unknown invflags */ if (ahinfo->invflags & ~IPT_AH_INV_MASK) { pr_debug("unknown flags %X\n", ahinfo->invflags); return -EINVAL; } return 0; } static struct xt_match ah_mt_reg __read_mostly = { .name = "ah", .family = NFPROTO_IPV4, .match = ah_mt, .matchsize = sizeof(struct ipt_ah), .proto = IPPROTO_AH, .checkentry = ah_mt_check, .me = THIS_MODULE, }; static int __init ah_mt_init(void) { return xt_register_match(&ah_mt_reg); } static void __exit ah_mt_exit(void) { xt_unregister_match(&ah_mt_reg); } module_init(ah_mt_init); module_exit(ah_mt_exit); |
6 3 1 3 1 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IP6 tables REJECT target module * Linux INET6 implementation * * Copyright (C)2003 USAGI/WIDE Project * * Authors: * Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp> * * Copyright (c) 2005-2007 Patrick McHardy <kaber@trash.net> * * Based on net/ipv4/netfilter/ipt_REJECT.c */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/gfp.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/icmpv6.h> #include <linux/netdevice.h> #include <net/icmp.h> #include <net/flow.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_ipv6/ip6t_REJECT.h> #include <net/netfilter/ipv6/nf_reject.h> MODULE_AUTHOR("Yasuyuki KOZAKAI <yasuyuki.kozakai@toshiba.co.jp>"); MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6"); MODULE_LICENSE("GPL"); static unsigned int reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct ip6t_reject_info *reject = par->targinfo; struct net *net = xt_net(par); switch (reject->with) { case IP6T_ICMP6_NO_ROUTE: nf_send_unreach6(net, skb, ICMPV6_NOROUTE, xt_hooknum(par)); break; case IP6T_ICMP6_ADM_PROHIBITED: nf_send_unreach6(net, skb, ICMPV6_ADM_PROHIBITED, xt_hooknum(par)); break; case IP6T_ICMP6_NOT_NEIGHBOUR: nf_send_unreach6(net, skb, ICMPV6_NOT_NEIGHBOUR, xt_hooknum(par)); break; case IP6T_ICMP6_ADDR_UNREACH: nf_send_unreach6(net, skb, ICMPV6_ADDR_UNREACH, xt_hooknum(par)); break; case IP6T_ICMP6_PORT_UNREACH: nf_send_unreach6(net, skb, ICMPV6_PORT_UNREACH, xt_hooknum(par)); break; case IP6T_ICMP6_ECHOREPLY: /* Do nothing */ break; case IP6T_TCP_RESET: nf_send_reset6(net, par->state->sk, skb, xt_hooknum(par)); break; case IP6T_ICMP6_POLICY_FAIL: nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, xt_hooknum(par)); break; case IP6T_ICMP6_REJECT_ROUTE: nf_send_unreach6(net, skb, ICMPV6_REJECT_ROUTE, xt_hooknum(par)); break; } return NF_DROP; } static int reject_tg6_check(const struct xt_tgchk_param *par) { const struct ip6t_reject_info *rejinfo = par->targinfo; const struct ip6t_entry *e = par->entryinfo; if (rejinfo->with == IP6T_ICMP6_ECHOREPLY) { pr_info_ratelimited("ECHOREPLY is not supported\n"); return -EINVAL; } else if (rejinfo->with == IP6T_TCP_RESET) { /* Must specify that it's a TCP packet */ if (!(e->ipv6.flags & IP6T_F_PROTO) || e->ipv6.proto != IPPROTO_TCP || (e->ipv6.invflags & XT_INV_PROTO)) { pr_info_ratelimited("TCP_RESET illegal for non-tcp\n"); return -EINVAL; } } return 0; } static struct xt_target reject_tg6_reg __read_mostly = { .name = "REJECT", .family = NFPROTO_IPV6, .target = reject_tg6, .targetsize = sizeof(struct ip6t_reject_info), .table = "filter", .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT), .checkentry = reject_tg6_check, .me = THIS_MODULE }; static int __init reject_tg6_init(void) { return xt_register_target(&reject_tg6_reg); } static void __exit reject_tg6_exit(void) { xt_unregister_target(&reject_tg6_reg); } module_init(reject_tg6_init); module_exit(reject_tg6_exit); |
21 8 66 17 11 37 14 31 10 13 22 8 19 7 5 2 19 19 4 22 4 21 22 22 2 23 23 23 18 18 20 18 22 22 6 22 23 1 22 23 23 2 21 17 7 15 2 13 2 11 7 6 1 6 13 17 17 1 16 12 13 12 9 3 16 28 29 1 29 28 28 29 21 21 3 19 16 21 18 18 1 18 10 1 8 18 22 7 15 23 23 1 22 18 18 7 18 10 1 10 8 2 2 2 8 9 9 9 9 9 9 9 8 9 9 9 23 50 13 34 34 35 29 33 28 7 34 46 46 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 | /* * net/tipc/name_table.c: TIPC name table code * * Copyright (c) 2000-2006, 2014-2018, Ericsson AB * Copyright (c) 2004-2008, 2010-2014, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <net/sock.h> #include <linux/list_sort.h> #include <linux/rbtree_augmented.h> #include "core.h" #include "netlink.h" #include "name_table.h" #include "name_distr.h" #include "subscr.h" #include "bcast.h" #include "addr.h" #include "node.h" #include "group.h" /** * struct service_range - container for all bindings of a service range * @lower: service range lower bound * @upper: service range upper bound * @tree_node: member of service range RB tree * @max: largest 'upper' in this node subtree * @local_publ: list of identical publications made from this node * Used by closest_first lookup and multicast lookup algorithm * @all_publ: all publications identical to this one, whatever node and scope * Used by round-robin lookup algorithm */ struct service_range { u32 lower; u32 upper; struct rb_node tree_node; u32 max; struct list_head local_publ; struct list_head all_publ; }; /** * struct tipc_service - container for all published instances of a service type * @type: 32 bit 'type' value for service * @publ_cnt: increasing counter for publications in this service * @ranges: rb tree containing all service ranges for this service * @service_list: links to adjacent name ranges in hash chain * @subscriptions: list of subscriptions for this service type * @lock: spinlock controlling access to pertaining service ranges/publications * @rcu: RCU callback head used for deferred freeing */ struct tipc_service { u32 type; u32 publ_cnt; struct rb_root ranges; struct hlist_node service_list; struct list_head subscriptions; spinlock_t lock; /* Covers service range list */ struct rcu_head rcu; }; #define service_range_upper(sr) ((sr)->upper) RB_DECLARE_CALLBACKS_MAX(static, sr_callbacks, struct service_range, tree_node, u32, max, service_range_upper) #define service_range_entry(rbtree_node) \ (container_of(rbtree_node, struct service_range, tree_node)) #define service_range_overlap(sr, start, end) \ ((sr)->lower <= (end) && (sr)->upper >= (start)) /** * service_range_foreach_match - iterate over tipc service rbtree for each * range match * @sr: the service range pointer as a loop cursor * @sc: the pointer to tipc service which holds the service range rbtree * @start: beginning of the search range (end >= start) for matching * @end: end of the search range (end >= start) for matching */ #define service_range_foreach_match(sr, sc, start, end) \ for (sr = service_range_match_first((sc)->ranges.rb_node, \ start, \ end); \ sr; \ sr = service_range_match_next(&(sr)->tree_node, \ start, \ end)) /** * service_range_match_first - find first service range matching a range * @n: the root node of service range rbtree for searching * @start: beginning of the search range (end >= start) for matching * @end: end of the search range (end >= start) for matching * * Return: the leftmost service range node in the rbtree that overlaps the * specific range if any. Otherwise, returns NULL. */ static struct service_range *service_range_match_first(struct rb_node *n, u32 start, u32 end) { struct service_range *sr; struct rb_node *l, *r; /* Non overlaps in tree at all? */ if (!n || service_range_entry(n)->max < start) return NULL; while (n) { l = n->rb_left; if (l && service_range_entry(l)->max >= start) { /* A leftmost overlap range node must be one in the left * subtree. If not, it has lower > end, then nodes on * the right side cannot satisfy the condition either. */ n = l; continue; } /* No one in the left subtree can match, return if this node is * an overlap i.e. leftmost. */ sr = service_range_entry(n); if (service_range_overlap(sr, start, end)) return sr; /* Ok, try to lookup on the right side */ r = n->rb_right; if (sr->lower <= end && r && service_range_entry(r)->max >= start) { n = r; continue; } break; } return NULL; } /** * service_range_match_next - find next service range matching a range * @n: a node in service range rbtree from which the searching starts * @start: beginning of the search range (end >= start) for matching * @end: end of the search range (end >= start) for matching * * Return: the next service range node to the given node in the rbtree that * overlaps the specific range if any. Otherwise, returns NULL. */ static struct service_range *service_range_match_next(struct rb_node *n, u32 start, u32 end) { struct service_range *sr; struct rb_node *p, *r; while (n) { r = n->rb_right; if (r && service_range_entry(r)->max >= start) /* A next overlap range node must be one in the right * subtree. If not, it has lower > end, then any next * successor (- an ancestor) of this node cannot * satisfy the condition either. */ return service_range_match_first(r, start, end); /* No one in the right subtree can match, go up to find an * ancestor of this node which is parent of a left-hand child. */ while ((p = rb_parent(n)) && n == p->rb_right) n = p; if (!p) break; /* Return if this ancestor is an overlap */ sr = service_range_entry(p); if (service_range_overlap(sr, start, end)) return sr; /* Ok, try to lookup more from this ancestor */ if (sr->lower <= end) { n = p; continue; } break; } return NULL; } static int hash(int x) { return x & (TIPC_NAMETBL_SIZE - 1); } /** * tipc_publ_create - create a publication structure * @ua: the service range the user is binding to * @sk: the address of the socket that is bound * @key: publication key */ static struct publication *tipc_publ_create(struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct publication *p = kzalloc(sizeof(*p), GFP_ATOMIC); if (!p) return NULL; p->sr = ua->sr; p->sk = *sk; p->scope = ua->scope; p->key = key; INIT_LIST_HEAD(&p->binding_sock); INIT_LIST_HEAD(&p->binding_node); INIT_LIST_HEAD(&p->local_publ); INIT_LIST_HEAD(&p->all_publ); INIT_LIST_HEAD(&p->list); return p; } /** * tipc_service_create - create a service structure for the specified 'type' * @net: network namespace * @ua: address representing the service to be bound * * Allocates a single range structure and sets it to all 0's. */ static struct tipc_service *tipc_service_create(struct net *net, struct tipc_uaddr *ua) { struct name_table *nt = tipc_name_table(net); struct tipc_service *service; struct hlist_head *hd; service = kzalloc(sizeof(*service), GFP_ATOMIC); if (!service) { pr_warn("Service creation failed, no memory\n"); return NULL; } spin_lock_init(&service->lock); service->type = ua->sr.type; service->ranges = RB_ROOT; INIT_HLIST_NODE(&service->service_list); INIT_LIST_HEAD(&service->subscriptions); hd = &nt->services[hash(ua->sr.type)]; hlist_add_head_rcu(&service->service_list, hd); return service; } /* tipc_service_find_range - find service range matching publication parameters */ static struct service_range *tipc_service_find_range(struct tipc_service *sc, struct tipc_uaddr *ua) { struct service_range *sr; service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { /* Look for exact match */ if (sr->lower == ua->sr.lower && sr->upper == ua->sr.upper) return sr; } return NULL; } static struct service_range *tipc_service_create_range(struct tipc_service *sc, struct publication *p) { struct rb_node **n, *parent = NULL; struct service_range *sr; u32 lower = p->sr.lower; u32 upper = p->sr.upper; n = &sc->ranges.rb_node; while (*n) { parent = *n; sr = service_range_entry(parent); if (lower == sr->lower && upper == sr->upper) return sr; if (sr->max < upper) sr->max = upper; if (lower <= sr->lower) n = &parent->rb_left; else n = &parent->rb_right; } sr = kzalloc(sizeof(*sr), GFP_ATOMIC); if (!sr) return NULL; sr->lower = lower; sr->upper = upper; sr->max = upper; INIT_LIST_HEAD(&sr->local_publ); INIT_LIST_HEAD(&sr->all_publ); rb_link_node(&sr->tree_node, parent, n); rb_insert_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); return sr; } static bool tipc_service_insert_publ(struct net *net, struct tipc_service *sc, struct publication *p) { struct tipc_subscription *sub, *tmp; struct service_range *sr; struct publication *_p; u32 node = p->sk.node; bool first = false; bool res = false; u32 key = p->key; spin_lock_bh(&sc->lock); sr = tipc_service_create_range(sc, p); if (!sr) goto exit; first = list_empty(&sr->all_publ); /* Return if the publication already exists */ list_for_each_entry(_p, &sr->all_publ, all_publ) { if (_p->key == key && (!_p->sk.node || _p->sk.node == node)) { pr_debug("Failed to bind duplicate %u,%u,%u/%u:%u/%u\n", p->sr.type, p->sr.lower, p->sr.upper, node, p->sk.ref, key); goto exit; } } if (in_own_node(net, p->sk.node)) list_add(&p->local_publ, &sr->local_publ); list_add(&p->all_publ, &sr->all_publ); p->id = sc->publ_cnt++; /* Any subscriptions waiting for notification? */ list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { tipc_sub_report_overlap(sub, p, TIPC_PUBLISHED, first); } res = true; exit: if (!res) pr_warn("Failed to bind to %u,%u,%u\n", p->sr.type, p->sr.lower, p->sr.upper); spin_unlock_bh(&sc->lock); return res; } /** * tipc_service_remove_publ - remove a publication from a service * @r: service_range to remove publication from * @sk: address publishing socket * @key: target publication key */ static struct publication *tipc_service_remove_publ(struct service_range *r, struct tipc_socket_addr *sk, u32 key) { struct publication *p; u32 node = sk->node; list_for_each_entry(p, &r->all_publ, all_publ) { if (p->key != key || (node && node != p->sk.node)) continue; list_del(&p->all_publ); list_del(&p->local_publ); return p; } return NULL; } /* * Code reused: time_after32() for the same purpose */ #define publication_after(pa, pb) time_after32((pa)->id, (pb)->id) static int tipc_publ_sort(void *priv, const struct list_head *a, const struct list_head *b) { struct publication *pa, *pb; pa = container_of(a, struct publication, list); pb = container_of(b, struct publication, list); return publication_after(pa, pb); } /** * tipc_service_subscribe - attach a subscription, and optionally * issue the prescribed number of events if there is any service * range overlapping with the requested range * @service: the tipc_service to attach the @sub to * @sub: the subscription to attach */ static void tipc_service_subscribe(struct tipc_service *service, struct tipc_subscription *sub) { struct publication *p, *first, *tmp; struct list_head publ_list; struct service_range *sr; u32 filter, lower, upper; filter = sub->s.filter; lower = sub->s.seq.lower; upper = sub->s.seq.upper; tipc_sub_get(sub); list_add(&sub->service_list, &service->subscriptions); if (filter & TIPC_SUB_NO_STATUS) return; INIT_LIST_HEAD(&publ_list); service_range_foreach_match(sr, service, lower, upper) { first = NULL; list_for_each_entry(p, &sr->all_publ, all_publ) { if (filter & TIPC_SUB_PORTS) list_add_tail(&p->list, &publ_list); else if (!first || publication_after(first, p)) /* Pick this range's *first* publication */ first = p; } if (first) list_add_tail(&first->list, &publ_list); } /* Sort the publications before reporting */ list_sort(NULL, &publ_list, tipc_publ_sort); list_for_each_entry_safe(p, tmp, &publ_list, list) { tipc_sub_report_overlap(sub, p, TIPC_PUBLISHED, true); list_del_init(&p->list); } } static struct tipc_service *tipc_service_find(struct net *net, struct tipc_uaddr *ua) { struct name_table *nt = tipc_name_table(net); struct hlist_head *service_head; struct tipc_service *service; service_head = &nt->services[hash(ua->sr.type)]; hlist_for_each_entry_rcu(service, service_head, service_list) { if (service->type == ua->sr.type) return service; } return NULL; }; struct publication *tipc_nametbl_insert_publ(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct tipc_service *sc; struct publication *p; p = tipc_publ_create(ua, sk, key); if (!p) return NULL; sc = tipc_service_find(net, ua); if (!sc) sc = tipc_service_create(net, ua); if (sc && tipc_service_insert_publ(net, sc, p)) return p; kfree(p); return NULL; } struct publication *tipc_nametbl_remove_publ(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct tipc_subscription *sub, *tmp; struct publication *p = NULL; struct service_range *sr; struct tipc_service *sc; bool last; sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); sr = tipc_service_find_range(sc, ua); if (!sr) goto unlock; p = tipc_service_remove_publ(sr, sk, key); if (!p) goto unlock; /* Notify any waiting subscriptions */ last = list_empty(&sr->all_publ); list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { tipc_sub_report_overlap(sub, p, TIPC_WITHDRAWN, last); } /* Remove service range item if this was its last publication */ if (list_empty(&sr->all_publ)) { rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); kfree(sr); } /* Delete service item if no more publications and subscriptions */ if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) { hlist_del_init_rcu(&sc->service_list); kfree_rcu(sc, rcu); } unlock: spin_unlock_bh(&sc->lock); exit: if (!p) { pr_err("Failed to remove unknown binding: %u,%u,%u/%u:%u/%u\n", ua->sr.type, ua->sr.lower, ua->sr.upper, sk->node, sk->ref, key); } return p; } /** * tipc_nametbl_lookup_anycast - perform service instance to socket translation * @net: network namespace * @ua: service address to look up * @sk: address to socket we want to find * * On entry, a non-zero 'sk->node' indicates the node where we want lookup to be * performed, which may not be this one. * * On exit: * * - If lookup is deferred to another node, leave 'sk->node' unchanged and * return 'true'. * - If lookup is successful, set the 'sk->node' and 'sk->ref' (== portid) which * represent the bound socket and return 'true'. * - If lookup fails, return 'false' * * Note that for legacy users (node configured with Z.C.N address format) the * 'closest-first' lookup algorithm must be maintained, i.e., if sk.node is 0 * we must look in the local binding list first */ bool tipc_nametbl_lookup_anycast(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk) { struct tipc_net *tn = tipc_net(net); bool legacy = tn->legacy_addr_format; u32 self = tipc_own_addr(net); u32 inst = ua->sa.instance; struct service_range *r; struct tipc_service *sc; struct publication *p; struct list_head *l; bool res = false; if (!tipc_in_scope(legacy, sk->node, self)) return true; rcu_read_lock(); sc = tipc_service_find(net, ua); if (unlikely(!sc)) goto exit; spin_lock_bh(&sc->lock); service_range_foreach_match(r, sc, inst, inst) { /* Select lookup algo: local, closest-first or round-robin */ if (sk->node == self) { l = &r->local_publ; if (list_empty(l)) continue; p = list_first_entry(l, struct publication, local_publ); list_move_tail(&p->local_publ, &r->local_publ); } else if (legacy && !sk->node && !list_empty(&r->local_publ)) { l = &r->local_publ; p = list_first_entry(l, struct publication, local_publ); list_move_tail(&p->local_publ, &r->local_publ); } else { l = &r->all_publ; p = list_first_entry(l, struct publication, all_publ); list_move_tail(&p->all_publ, &r->all_publ); } *sk = p->sk; res = true; /* Todo: as for legacy, pick the first matching range only, a * "true" round-robin will be performed as needed. */ break; } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); return res; } /* tipc_nametbl_lookup_group(): lookup destinaton(s) in a communication group * Returns a list of one (== group anycast) or more (== group multicast) * destination socket/node pairs matching the given address. * The requester may or may not want to exclude himself from the list. */ bool tipc_nametbl_lookup_group(struct net *net, struct tipc_uaddr *ua, struct list_head *dsts, int *dstcnt, u32 exclude, bool mcast) { u32 self = tipc_own_addr(net); u32 inst = ua->sa.instance; struct service_range *sr; struct tipc_service *sc; struct publication *p; *dstcnt = 0; rcu_read_lock(); sc = tipc_service_find(net, ua); if (unlikely(!sc)) goto exit; spin_lock_bh(&sc->lock); /* Todo: a full search i.e. service_range_foreach_match() instead? */ sr = service_range_match_first(sc->ranges.rb_node, inst, inst); if (!sr) goto no_match; list_for_each_entry(p, &sr->all_publ, all_publ) { if (p->scope != ua->scope) continue; if (p->sk.ref == exclude && p->sk.node == self) continue; tipc_dest_push(dsts, p->sk.node, p->sk.ref); (*dstcnt)++; if (mcast) continue; list_move_tail(&p->all_publ, &sr->all_publ); break; } no_match: spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); return !list_empty(dsts); } /* tipc_nametbl_lookup_mcast_sockets(): look up node local destinaton sockets * matching the given address * Used on nodes which have received a multicast/broadcast message * Returns a list of local sockets */ void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua, struct list_head *dports) { struct service_range *sr; struct tipc_service *sc; struct publication *p; u8 scope = ua->scope; rcu_read_lock(); sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { list_for_each_entry(p, &sr->local_publ, local_publ) { if (scope == p->scope || scope == TIPC_ANY_SCOPE) tipc_dest_push(dports, 0, p->sk.ref); } } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_lookup_mcast_nodes(): look up all destination nodes matching * the given address. Used in sending node. * Used on nodes which are sending out a multicast/broadcast message * Returns a list of nodes, including own node if applicable */ void tipc_nametbl_lookup_mcast_nodes(struct net *net, struct tipc_uaddr *ua, struct tipc_nlist *nodes) { struct service_range *sr; struct tipc_service *sc; struct publication *p; rcu_read_lock(); sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { list_for_each_entry(p, &sr->all_publ, all_publ) { tipc_nlist_add(nodes, p->sk.node); } } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_build_group - build list of communication group members */ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, struct tipc_uaddr *ua) { struct service_range *sr; struct tipc_service *sc; struct publication *p; struct rb_node *n; rcu_read_lock(); sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { sr = container_of(n, struct service_range, tree_node); list_for_each_entry(p, &sr->all_publ, all_publ) { if (p->scope != ua->scope) continue; tipc_group_add_member(grp, p->sk.node, p->sk.ref, p->sr.lower); } } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_publish - add service binding to name table */ struct publication *tipc_nametbl_publish(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct publication *p = NULL; struct sk_buff *skb = NULL; u32 rc_dests; spin_lock_bh(&tn->nametbl_lock); if (nt->local_publ_count >= TIPC_MAX_PUBL) { pr_warn("Bind failed, max limit %u reached\n", TIPC_MAX_PUBL); goto exit; } p = tipc_nametbl_insert_publ(net, ua, sk, key); if (p) { nt->local_publ_count++; skb = tipc_named_publish(net, p); } rc_dests = nt->rc_dests; exit: spin_unlock_bh(&tn->nametbl_lock); if (skb) tipc_node_broadcast(net, skb, rc_dests); return p; } /** * tipc_nametbl_withdraw - withdraw a service binding * @net: network namespace * @ua: service address/range being unbound * @sk: address of the socket being unbound from * @key: target publication key */ void tipc_nametbl_withdraw(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct sk_buff *skb = NULL; struct publication *p; u32 rc_dests; spin_lock_bh(&tn->nametbl_lock); p = tipc_nametbl_remove_publ(net, ua, sk, key); if (p) { nt->local_publ_count--; skb = tipc_named_withdraw(net, p); list_del_init(&p->binding_sock); kfree_rcu(p, rcu); } rc_dests = nt->rc_dests; spin_unlock_bh(&tn->nametbl_lock); if (skb) tipc_node_broadcast(net, skb, rc_dests); } /** * tipc_nametbl_subscribe - add a subscription object to the name table * @sub: subscription to add */ bool tipc_nametbl_subscribe(struct tipc_subscription *sub) { struct tipc_net *tn = tipc_net(sub->net); u32 type = sub->s.seq.type; struct tipc_service *sc; struct tipc_uaddr ua; bool res = true; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, type, sub->s.seq.lower, sub->s.seq.upper); spin_lock_bh(&tn->nametbl_lock); sc = tipc_service_find(sub->net, &ua); if (!sc) sc = tipc_service_create(sub->net, &ua); if (sc) { spin_lock_bh(&sc->lock); tipc_service_subscribe(sc, sub); spin_unlock_bh(&sc->lock); } else { pr_warn("Failed to subscribe for {%u,%u,%u}\n", type, sub->s.seq.lower, sub->s.seq.upper); res = false; } spin_unlock_bh(&tn->nametbl_lock); return res; } /** * tipc_nametbl_unsubscribe - remove a subscription object from name table * @sub: subscription to remove */ void tipc_nametbl_unsubscribe(struct tipc_subscription *sub) { struct tipc_net *tn = tipc_net(sub->net); struct tipc_service *sc; struct tipc_uaddr ua; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, sub->s.seq.type, sub->s.seq.lower, sub->s.seq.upper); spin_lock_bh(&tn->nametbl_lock); sc = tipc_service_find(sub->net, &ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); list_del_init(&sub->service_list); tipc_sub_put(sub); /* Delete service item if no more publications and subscriptions */ if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) { hlist_del_init_rcu(&sc->service_list); kfree_rcu(sc, rcu); } spin_unlock_bh(&sc->lock); exit: spin_unlock_bh(&tn->nametbl_lock); } int tipc_nametbl_init(struct net *net) { struct tipc_net *tn = tipc_net(net); struct name_table *nt; int i; nt = kzalloc(sizeof(*nt), GFP_KERNEL); if (!nt) return -ENOMEM; for (i = 0; i < TIPC_NAMETBL_SIZE; i++) INIT_HLIST_HEAD(&nt->services[i]); INIT_LIST_HEAD(&nt->node_scope); INIT_LIST_HEAD(&nt->cluster_scope); rwlock_init(&nt->cluster_scope_lock); tn->nametbl = nt; spin_lock_init(&tn->nametbl_lock); return 0; } /** * tipc_service_delete - purge all publications for a service and delete it * @net: the associated network namespace * @sc: tipc_service to delete */ static void tipc_service_delete(struct net *net, struct tipc_service *sc) { struct service_range *sr, *tmpr; struct publication *p, *tmp; spin_lock_bh(&sc->lock); rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) { list_for_each_entry_safe(p, tmp, &sr->all_publ, all_publ) { tipc_service_remove_publ(sr, &p->sk, p->key); kfree_rcu(p, rcu); } rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); kfree(sr); } hlist_del_init_rcu(&sc->service_list); spin_unlock_bh(&sc->lock); kfree_rcu(sc, rcu); } void tipc_nametbl_stop(struct net *net) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct hlist_head *service_head; struct tipc_service *service; u32 i; /* Verify name table is empty and purge any lingering * publications, then release the name table */ spin_lock_bh(&tn->nametbl_lock); for (i = 0; i < TIPC_NAMETBL_SIZE; i++) { if (hlist_empty(&nt->services[i])) continue; service_head = &nt->services[i]; hlist_for_each_entry_rcu(service, service_head, service_list) { tipc_service_delete(net, service); } } spin_unlock_bh(&tn->nametbl_lock); synchronize_net(); kfree(nt); } static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, struct tipc_service *service, struct service_range *sr, u32 *last_key) { struct publication *p; struct nlattr *attrs; struct nlattr *b; void *hdr; if (*last_key) { list_for_each_entry(p, &sr->all_publ, all_publ) if (p->key == *last_key) break; if (list_entry_is_head(p, &sr->all_publ, all_publ)) return -EPIPE; } else { p = list_first_entry(&sr->all_publ, struct publication, all_publ); } list_for_each_entry_from(p, &sr->all_publ, all_publ) { *last_key = p->key; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NAME_TABLE_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NAME_TABLE); if (!attrs) goto msg_full; b = nla_nest_start_noflag(msg->skb, TIPC_NLA_NAME_TABLE_PUBL); if (!b) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_TYPE, service->type)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_LOWER, sr->lower)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_UPPER, sr->upper)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_SCOPE, p->scope)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_NODE, p->sk.node)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->sk.ref)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_KEY, p->key)) goto publ_msg_full; nla_nest_end(msg->skb, b); nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); } *last_key = 0; return 0; publ_msg_full: nla_nest_cancel(msg->skb, b); attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } static int __tipc_nl_service_range_list(struct tipc_nl_msg *msg, struct tipc_service *sc, u32 *last_lower, u32 *last_key) { struct service_range *sr; struct rb_node *n; int err; for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { sr = container_of(n, struct service_range, tree_node); if (sr->lower < *last_lower) continue; err = __tipc_nl_add_nametable_publ(msg, sc, sr, last_key); if (err) { *last_lower = sr->lower; return err; } } *last_lower = 0; return 0; } static int tipc_nl_service_list(struct net *net, struct tipc_nl_msg *msg, u32 *last_type, u32 *last_lower, u32 *last_key) { struct tipc_net *tn = tipc_net(net); struct tipc_service *service = NULL; struct hlist_head *head; struct tipc_uaddr ua; int err; int i; if (*last_type) i = hash(*last_type); else i = 0; for (; i < TIPC_NAMETBL_SIZE; i++) { head = &tn->nametbl->services[i]; if (*last_type || (!i && *last_key && (*last_lower == *last_key))) { tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, *last_type, *last_lower, *last_lower); service = tipc_service_find(net, &ua); if (!service) return -EPIPE; } else { hlist_for_each_entry_rcu(service, head, service_list) break; if (!service) continue; } hlist_for_each_entry_from_rcu(service, service_list) { spin_lock_bh(&service->lock); err = __tipc_nl_service_range_list(msg, service, last_lower, last_key); if (err) { *last_type = service->type; spin_unlock_bh(&service->lock); return err; } spin_unlock_bh(&service->lock); } *last_type = 0; } return 0; } int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 last_type = cb->args[0]; u32 last_lower = cb->args[1]; u32 last_key = cb->args[2]; int done = cb->args[3]; struct tipc_nl_msg msg; int err; if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); err = tipc_nl_service_list(net, &msg, &last_type, &last_lower, &last_key); if (!err) { done = 1; } else if (err != -EMSGSIZE) { /* We never set seq or call nl_dump_check_consistent() this * means that setting prev_seq here will cause the consistence * check to fail in the netlink callback handler. Resulting in * the NLMSG_DONE message having the NLM_F_DUMP_INTR flag set if * we got an error. */ cb->prev_seq = 1; } rcu_read_unlock(); cb->args[0] = last_type; cb->args[1] = last_lower; cb->args[2] = last_key; cb->args[3] = done; return skb->len; } struct tipc_dest *tipc_dest_find(struct list_head *l, u32 node, u32 port) { struct tipc_dest *dst; list_for_each_entry(dst, l, list) { if (dst->node == node && dst->port == port) return dst; } return NULL; } bool tipc_dest_push(struct list_head *l, u32 node, u32 port) { struct tipc_dest *dst; if (tipc_dest_find(l, node, port)) return false; dst = kmalloc(sizeof(*dst), GFP_ATOMIC); if (unlikely(!dst)) return false; dst->node = node; dst->port = port; list_add(&dst->list, l); return true; } bool tipc_dest_pop(struct list_head *l, u32 *node, u32 *port) { struct tipc_dest *dst; if (list_empty(l)) return false; dst = list_first_entry(l, typeof(*dst), list); if (port) *port = dst->port; if (node) *node = dst->node; list_del(&dst->list); kfree(dst); return true; } bool tipc_dest_del(struct list_head *l, u32 node, u32 port) { struct tipc_dest *dst; dst = tipc_dest_find(l, node, port); if (!dst) return false; list_del(&dst->list); kfree(dst); return true; } void tipc_dest_list_purge(struct list_head *l) { struct tipc_dest *dst, *tmp; list_for_each_entry_safe(dst, tmp, l, list) { list_del(&dst->list); kfree(dst); } } |
604 606 276 273 273 277 276 9873 9868 9866 9859 569 565 565 451 271 569 569 568 568 566 84 84 85 85 85 85 10619 10634 23339 10619 19141 77 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1994 Linus Torvalds * * Pentium III FXSR, SSE support * General FPU state handling cleanups * Gareth Hughes <gareth@valinux.com>, May 2000 */ #include <asm/fpu/api.h> #include <asm/fpu/regset.h> #include <asm/fpu/sched.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> #include <asm/traps.h> #include <asm/irq_regs.h> #include <uapi/asm/kvm.h> #include <linux/hardirq.h> #include <linux/pkeys.h> #include <linux/vmalloc.h> #include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" #define CREATE_TRACE_POINTS #include <asm/trace/fpu.h> #ifdef CONFIG_X86_64 DEFINE_STATIC_KEY_FALSE(__fpu_state_size_dynamic); DEFINE_PER_CPU(u64, xfd_state); #endif /* The FPU state configuration data for kernel and user space */ struct fpu_state_config fpu_kernel_cfg __ro_after_init; struct fpu_state_config fpu_user_cfg __ro_after_init; /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: */ struct fpstate init_fpstate __ro_after_init; /* Track in-kernel FPU usage */ static DEFINE_PER_CPU(bool, in_kernel_fpu); /* * Track which context is using the FPU on the CPU: */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? */ bool irq_fpu_usable(void) { if (WARN_ON_ONCE(in_nmi())) return false; /* In kernel FPU usage already active? */ if (this_cpu_read(in_kernel_fpu)) return false; /* * When not in NMI or hard interrupt context, FPU can be used in: * * - Task context except from within fpregs_lock()'ed critical * regions. * * - Soft interrupt processing context which cannot happen * while in a fpregs_lock()'ed critical region. */ if (!in_hardirq()) return true; /* * In hard interrupt context it's safe when soft interrupts * are enabled, which means the interrupt did not hit in * a fpregs_lock()'ed critical region. */ return !softirq_count(); } EXPORT_SYMBOL(irq_fpu_usable); /* * Track AVX512 state use because it is known to slow the max clock * speed of the core. */ static void update_avx_timestamp(struct fpu *fpu) { #define AVX512_TRACKING_MASK (XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM) if (fpu->fpstate->regs.xsave.header.xfeatures & AVX512_TRACKING_MASK) fpu->avx512_timestamp = jiffies; } /* * Save the FPU register state in fpu->fpstate->regs. The register state is * preserved. * * Must be called with fpregs_lock() held. * * The legacy FNSAVE instruction clears all FPU state unconditionally, so * register state has to be reloaded. That might be a pointless exercise * when the FPU is going to be used by another task right after that. But * this only affects 20+ years old 32bit systems and avoids conditionals all * over the place. * * FXSAVE and all XSAVE variants preserve the FPU register state. */ void save_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { os_xsave(fpu->fpstate); update_avx_timestamp(fpu); return; } if (likely(use_fxsr())) { fxsave(&fpu->fpstate->regs.fxsave); return; } /* * Legacy FPU register saving, FNSAVE always clears FPU registers, * so we have to reload them from the memory state. */ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->fpstate->regs.fsave)); frstor(&fpu->fpstate->regs.fsave); } void restore_fpregs_from_fpstate(struct fpstate *fpstate, u64 mask) { /* * AMD K7/K8 and later CPUs up to Zen don't save/restore * FDP/FIP/FOP unless an exception is pending. Clear the x87 state * here by setting it to fixed values. "m" is a random variable * that should be in L1. */ if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" "emms\n\t" "fildl %[addr]" /* set F?P to defined value */ : : [addr] "m" (*fpstate)); } if (use_xsave()) { /* * Dynamically enabled features are enabled in XCR0, but * usage requires also that the corresponding bits in XFD * are cleared. If the bits are set then using a related * instruction will raise #NM. This allows to do the * allocation of the larger FPU buffer lazy from #NM or if * the task has no permission to kill it which would happen * via #UD if the feature is disabled in XCR0. * * XFD state is following the same life time rules as * XSTATE and to restore state correctly XFD has to be * updated before XRSTORS otherwise the component would * stay in or go into init state even if the bits are set * in fpstate::regs::xsave::xfeatures. */ xfd_update_state(fpstate); /* * Restoring state always needs to modify all features * which are in @mask even if the current task cannot use * extended features. * * So fpstate->xfeatures cannot be used here, because then * a feature for which the task has no permission but was * used by the previous task would not go into init state. */ mask = fpu_kernel_cfg.max_features & mask; os_xrstor(fpstate, mask); } else { if (use_fxsr()) fxrstor(&fpstate->regs.fxsave); else frstor(&fpstate->regs.fsave); } } void fpu_reset_from_exception_fixup(void) { restore_fpregs_from_fpstate(&init_fpstate, XFEATURE_MASK_FPSTATE); } #if IS_ENABLED(CONFIG_KVM) static void __fpstate_reset(struct fpstate *fpstate, u64 xfd); static void fpu_init_guest_permissions(struct fpu_guest *gfpu) { struct fpu_state_perm *fpuperm; u64 perm; if (!IS_ENABLED(CONFIG_X86_64)) return; spin_lock_irq(¤t->sighand->siglock); fpuperm = ¤t->group_leader->thread.fpu.guest_perm; perm = fpuperm->__state_perm; /* First fpstate allocation locks down permissions. */ WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED); spin_unlock_irq(¤t->sighand->siglock); gfpu->perm = perm & ~FPU_GUEST_PERM_LOCKED; } bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) { struct fpstate *fpstate; unsigned int size; size = fpu_user_cfg.default_size + ALIGN(offsetof(struct fpstate, regs), 64); fpstate = vzalloc(size); if (!fpstate) return false; /* Leave xfd to 0 (the reset value defined by spec) */ __fpstate_reset(fpstate, 0); fpstate_init_user(fpstate); fpstate->is_valloc = true; fpstate->is_guest = true; gfpu->fpstate = fpstate; gfpu->xfeatures = fpu_user_cfg.default_features; gfpu->perm = fpu_user_cfg.default_features; /* * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state * to userspace, even when XSAVE is unsupported, so that restoring FPU * state on a different CPU that does support XSAVE can cleanly load * the incoming state using its natural XSAVE. In other words, KVM's * uABI size may be larger than this host's default size. Conversely, * the default size should never be larger than KVM's base uABI size; * all features that can expand the uABI size must be opt-in. */ gfpu->uabi_size = sizeof(struct kvm_xsave); if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size)) gfpu->uabi_size = fpu_user_cfg.default_size; fpu_init_guest_permissions(gfpu); return true; } EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); void fpu_free_guest_fpstate(struct fpu_guest *gfpu) { struct fpstate *fps = gfpu->fpstate; if (!fps) return; if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use)) return; gfpu->fpstate = NULL; vfree(fps); } EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); /* * fpu_enable_guest_xfd_features - Check xfeatures against guest perm and enable * @guest_fpu: Pointer to the guest FPU container * @xfeatures: Features requested by guest CPUID * * Enable all dynamic xfeatures according to guest perm and requested CPUID. * * Return: 0 on success, error code otherwise */ int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures) { lockdep_assert_preemption_enabled(); /* Nothing to do if all requested features are already enabled. */ xfeatures &= ~guest_fpu->xfeatures; if (!xfeatures) return 0; return __xfd_enable_feature(xfeatures, guest_fpu); } EXPORT_SYMBOL_GPL(fpu_enable_guest_xfd_features); #ifdef CONFIG_X86_64 void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { fpregs_lock(); guest_fpu->fpstate->xfd = xfd; if (guest_fpu->fpstate->in_use) xfd_update_state(guest_fpu->fpstate); fpregs_unlock(); } EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); /** * fpu_sync_guest_vmexit_xfd_state - Synchronize XFD MSR and software state * * Must be invoked from KVM after a VMEXIT before enabling interrupts when * XFD write emulation is disabled. This is required because the guest can * freely modify XFD and the state at VMEXIT is not guaranteed to be the * same as the state on VMENTER. So software state has to be updated before * any operation which depends on it can take place. * * Note: It can be invoked unconditionally even when write emulation is * enabled for the price of a then pointless MSR read. */ void fpu_sync_guest_vmexit_xfd_state(void) { struct fpstate *fps = current->thread.fpu.fpstate; lockdep_assert_irqs_disabled(); if (fpu_state_size_dynamic()) { rdmsrl(MSR_IA32_XFD, fps->xfd); __this_cpu_write(xfd_state, fps->xfd); } } EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); #endif /* CONFIG_X86_64 */ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) { struct fpstate *guest_fps = guest_fpu->fpstate; struct fpu *fpu = ¤t->thread.fpu; struct fpstate *cur_fps = fpu->fpstate; fpregs_lock(); if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD)) save_fpregs_to_fpstate(fpu); /* Swap fpstate */ if (enter_guest) { fpu->__task_fpstate = cur_fps; fpu->fpstate = guest_fps; guest_fps->in_use = true; } else { guest_fps->in_use = false; fpu->fpstate = fpu->__task_fpstate; fpu->__task_fpstate = NULL; } cur_fps = fpu->fpstate; if (!cur_fps->is_confidential) { /* Includes XFD update */ restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE); } else { /* * XSTATE is restored by firmware from encrypted * memory. Make sure XFD state is correct while * running with guest fpstate */ xfd_update_state(cur_fps); } fpregs_mark_activate(); fpregs_unlock(); return 0; } EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u64 xfeatures, u32 pkru) { struct fpstate *kstate = gfpu->fpstate; union fpregs_state *ustate = buf; struct membuf mb = { .p = buf, .left = size }; if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { __copy_xstate_to_uabi_buf(mb, kstate, xfeatures, pkru, XSTATE_COPY_XSAVE); } else { memcpy(&ustate->fxsave, &kstate->regs.fxsave, sizeof(ustate->fxsave)); /* Make it restorable on a XSAVE enabled host */ ustate->xsave.header.xfeatures = XFEATURE_MASK_FPSSE; } } EXPORT_SYMBOL_GPL(fpu_copy_guest_fpstate_to_uabi); int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru) { struct fpstate *kstate = gfpu->fpstate; const union fpregs_state *ustate = buf; if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) { if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE) return -EINVAL; if (ustate->fxsave.mxcsr & ~mxcsr_feature_mask) return -EINVAL; memcpy(&kstate->regs.fxsave, &ustate->fxsave, sizeof(ustate->fxsave)); return 0; } if (ustate->xsave.header.xfeatures & ~xcr0) return -EINVAL; /* * Nullify @vpkru to preserve its current value if PKRU's bit isn't set * in the header. KVM's odd ABI is to leave PKRU untouched in this * case (all other components are eventually re-initialized). */ if (!(ustate->xsave.header.xfeatures & XFEATURE_MASK_PKRU)) vpkru = NULL; return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru); } EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate); #endif /* CONFIG_KVM */ void kernel_fpu_begin_mask(unsigned int kfpu_mask) { preempt_disable(); WARN_ON_FPU(!irq_fpu_usable()); WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, true); if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); save_fpregs_to_fpstate(¤t->thread.fpu); } __cpu_invalidate_fpregs_state(); /* Put sane initial values into the control registers. */ if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM)) ldmxcsr(MXCSR_DEFAULT); if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU)) asm volatile ("fninit"); } EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); void kernel_fpu_end(void) { WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, false); preempt_enable(); } EXPORT_SYMBOL_GPL(kernel_fpu_end); /* * Sync the FPU register state to current's memory register state when the * current task owns the FPU. The hardware register state is preserved. */ void fpu_sync_fpstate(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); fpregs_lock(); trace_x86_fpu_before_save(fpu); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) save_fpregs_to_fpstate(fpu); trace_x86_fpu_after_save(fpu); fpregs_unlock(); } static inline unsigned int init_fpstate_copy_size(void) { if (!use_xsave()) return fpu_kernel_cfg.default_size; /* XSAVE(S) just needs the legacy and the xstate header part */ return sizeof(init_fpstate.regs.xsave); } static inline void fpstate_init_fxstate(struct fpstate *fpstate) { fpstate->regs.fxsave.cwd = 0x37f; fpstate->regs.fxsave.mxcsr = MXCSR_DEFAULT; } /* * Legacy x87 fpstate state init: */ static inline void fpstate_init_fstate(struct fpstate *fpstate) { fpstate->regs.fsave.cwd = 0xffff037fu; fpstate->regs.fsave.swd = 0xffff0000u; fpstate->regs.fsave.twd = 0xffffffffu; fpstate->regs.fsave.fos = 0xffff0000u; } /* * Used in two places: * 1) Early boot to setup init_fpstate for non XSAVE systems * 2) fpu_init_fpstate_user() which is invoked from KVM */ void fpstate_init_user(struct fpstate *fpstate) { if (!cpu_feature_enabled(X86_FEATURE_FPU)) { fpstate_init_soft(&fpstate->regs.soft); return; } xstate_init_xcomp_bv(&fpstate->regs.xsave, fpstate->xfeatures); if (cpu_feature_enabled(X86_FEATURE_FXSR)) fpstate_init_fxstate(fpstate); else fpstate_init_fstate(fpstate); } static void __fpstate_reset(struct fpstate *fpstate, u64 xfd) { /* Initialize sizes and feature masks */ fpstate->size = fpu_kernel_cfg.default_size; fpstate->user_size = fpu_user_cfg.default_size; fpstate->xfeatures = fpu_kernel_cfg.default_features; fpstate->user_xfeatures = fpu_user_cfg.default_features; fpstate->xfd = xfd; } void fpstate_reset(struct fpu *fpu) { /* Set the fpstate pointer to the default fpstate */ fpu->fpstate = &fpu->__fpstate; __fpstate_reset(fpu->fpstate, init_fpstate.xfd); /* Initialize the permission related info in fpu */ fpu->perm.__state_perm = fpu_kernel_cfg.default_features; fpu->perm.__state_size = fpu_kernel_cfg.default_size; fpu->perm.__user_state_size = fpu_user_cfg.default_size; /* Same defaults for guests */ fpu->guest_perm = fpu->perm; } static inline void fpu_inherit_perms(struct fpu *dst_fpu) { if (fpu_state_size_dynamic()) { struct fpu *src_fpu = ¤t->group_leader->thread.fpu; spin_lock_irq(¤t->sighand->siglock); /* Fork also inherits the permissions of the parent */ dst_fpu->perm = src_fpu->perm; dst_fpu->guest_perm = src_fpu->guest_perm; spin_unlock_irq(¤t->sighand->siglock); } } /* A passed ssp of zero will not cause any update */ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) { #ifdef CONFIG_X86_USER_SHADOW_STACK struct cet_user_state *xstate; /* If ssp update is not needed. */ if (!ssp) return 0; xstate = get_xsave_addr(&dst->thread.fpu.fpstate->regs.xsave, XFEATURE_CET_USER); /* * If there is a non-zero ssp, then 'dst' must be configured with a shadow * stack and the fpu state should be up to date since it was just copied * from the parent in fpu_clone(). So there must be a valid non-init CET * state location in the buffer. */ if (WARN_ON_ONCE(!xstate)) return 1; xstate->user_ssp = (u64)ssp; #endif return 0; } /* Clone current's FPU state on fork */ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, unsigned long ssp) { struct fpu *src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; fpstate_reset(dst_fpu); if (!cpu_feature_enabled(X86_FEATURE_FPU)) return 0; /* * Enforce reload for user space tasks and prevent kernel threads * from trying to save the FPU registers on context switch. */ set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); /* * No FPU state inheritance for kernel threads and IO * worker threads. */ if (minimal) { /* Clear out the minimal state */ memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); return 0; } /* * If a new feature is added, ensure all dynamic features are * caller-saved from here! */ BUILD_BUG_ON(XFEATURE_MASK_USER_DYNAMIC != XFEATURE_MASK_XTILE_DATA); /* * Save the default portion of the current FPU state into the * clone. Assume all dynamic features to be defined as caller- * saved, which enables skipping both the expansion of fpstate * and the copying of any dynamic state. * * Do not use memcpy() when TIF_NEED_FPU_LOAD is set because * copying is not valid when current uses non-default states. */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); save_fpregs_to_fpstate(dst_fpu); fpregs_unlock(); if (!(clone_flags & CLONE_THREAD)) fpu_inherit_perms(dst_fpu); /* * Children never inherit PASID state. * Force it to have its init value: */ if (use_xsave()) dst_fpu->fpstate->regs.xsave.header.xfeatures &= ~XFEATURE_MASK_PASID; /* * Update shadow stack pointer, in case it changed during clone. */ if (update_fpu_shstk(dst, ssp)) return 1; trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); return 0; } /* * Whitelist the FPU register state embedded into task_struct for hardened * usercopy. */ void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); *size = fpu_kernel_cfg.default_size; } /* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents * in the fpregs in the eager-FPU case. * * This function can be used in cases where we know that * a state-restore is coming: either an explicit one, * or a reschedule. */ void fpu__drop(struct fpu *fpu) { preempt_disable(); if (fpu == ¤t->thread.fpu) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" _ASM_EXTABLE(1b, 2b)); fpregs_deactivate(fpu); } trace_x86_fpu_dropped(fpu); preempt_enable(); } /* * Clear FPU registers by setting them up from the init fpstate. * Caller must do fpregs_[un]lock() around it. */ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) { if (use_xsave()) os_xrstor(&init_fpstate, features_mask); else if (use_fxsr()) fxrstor(&init_fpstate.regs.fxsave); else frstor(&init_fpstate.regs.fsave); pkru_write_default(); } /* * Reset current->fpu memory state to the init values. */ static void fpu_reset_fpregs(void) { struct fpu *fpu = ¤t->thread.fpu; fpregs_lock(); __fpu_invalidate_fpregs_state(fpu); /* * This does not change the actual hardware registers. It just * resets the memory image and sets TIF_NEED_FPU_LOAD so a * subsequent return to usermode will reload the registers from the * task's memory image. * * Do not use fpstate_init() here. Just copy init_fpstate which has * the correct content already except for PKRU. * * PKRU handling does not rely on the xstate when restoring for * user space as PKRU is eagerly written in switch_to() and * flush_thread(). */ memcpy(&fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); set_thread_flag(TIF_NEED_FPU_LOAD); fpregs_unlock(); } /* * Reset current's user FPU states to the init states. current's * supervisor states, if any, are not modified by this function. The * caller guarantees that the XSTATE header in memory is intact. */ void fpu__clear_user_states(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); fpregs_lock(); if (!cpu_feature_enabled(X86_FEATURE_FPU)) { fpu_reset_fpregs(); fpregs_unlock(); return; } /* * Ensure that current's supervisor states are loaded into their * corresponding registers. */ if (xfeatures_mask_supervisor() && !fpregs_state_valid(fpu, smp_processor_id())) os_xrstor_supervisor(fpu->fpstate); /* Reset user states in registers. */ restore_fpregs_from_init_fpstate(XFEATURE_MASK_USER_RESTORE); /* * Now all FPU registers have their desired values. Inform the FPU * state machine that current's FPU registers are in the hardware * registers. The memory image does not need to be updated because * any operation relying on it has to save the registers first when * current's FPU is marked active. */ fpregs_mark_activate(); fpregs_unlock(); } void fpu_flush_thread(void) { fpstate_reset(¤t->thread.fpu); fpu_reset_fpregs(); } /* * Load FPU context before returning to userspace. */ void switch_fpu_return(void) { if (!static_cpu_has(X86_FEATURE_FPU)) return; fpregs_restore_userregs(); } EXPORT_SYMBOL_GPL(switch_fpu_return); void fpregs_lock_and_load(void) { /* * fpregs_lock() only disables preemption (mostly). So modifying state * in an interrupt could screw up some in progress fpregs operation. * Warn about it. */ WARN_ON_ONCE(!irq_fpu_usable()); WARN_ON_ONCE(current->flags & PF_KTHREAD); fpregs_lock(); fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); } #ifdef CONFIG_X86_DEBUG_FPU /* * If current FPU state according to its tracking (loaded FPU context on this * CPU) is not valid then we must have TIF_NEED_FPU_LOAD set so the context is * loaded on return to userland. */ void fpregs_assert_state_consistent(void) { struct fpu *fpu = ¤t->thread.fpu; if (test_thread_flag(TIF_NEED_FPU_LOAD)) return; WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id())); } EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent); #endif void fpregs_mark_activate(void) { struct fpu *fpu = ¤t->thread.fpu; fpregs_activate(fpu); fpu->last_cpu = smp_processor_id(); clear_thread_flag(TIF_NEED_FPU_LOAD); } /* * x87 math exception handling: */ int fpu__exception_code(struct fpu *fpu, int trap_nr) { int err; if (trap_nr == X86_TRAP_MF) { unsigned short cwd, swd; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked * status. 0x3f is the exception bits in these regs, 0x200 is the * C1 reg you need in case of a stack fault, 0x040 is the stack * fault bit. We should only be taking one exception at a time, * so if this combination doesn't produce any single exception, * then we have a bad program that isn't synchronizing its FPU usage * and it will suffer the consequences since we won't be able to * fully reproduce the context of the exception. */ if (boot_cpu_has(X86_FEATURE_FXSR)) { cwd = fpu->fpstate->regs.fxsave.cwd; swd = fpu->fpstate->regs.fxsave.swd; } else { cwd = (unsigned short)fpu->fpstate->regs.fsave.cwd; swd = (unsigned short)fpu->fpstate->regs.fsave.swd; } err = swd & ~cwd; } else { /* * The SIMD FPU exceptions are handled a little differently, as there * is only a single status/control register. Thus, to determine which * unmasked exception was caught we must mask the exception mask bits * at 0x1f80, and then use these to mask the exception bits at 0x3f. */ unsigned short mxcsr = MXCSR_DEFAULT; if (boot_cpu_has(X86_FEATURE_XMM)) mxcsr = fpu->fpstate->regs.fxsave.mxcsr; err = ~(mxcsr >> 7) & mxcsr; } if (err & 0x001) { /* Invalid op */ /* * swd & 0x240 == 0x040: Stack Underflow * swd & 0x240 == 0x240: Stack Overflow * User must clear the SF bit (0x40) if set */ return FPE_FLTINV; } else if (err & 0x004) { /* Divide by Zero */ return FPE_FLTDIV; } else if (err & 0x008) { /* Overflow */ return FPE_FLTOVF; } else if (err & 0x012) { /* Denormal, Underflow */ return FPE_FLTUND; } else if (err & 0x020) { /* Precision */ return FPE_FLTRES; } /* * If we're using IRQ 13, or supposedly even some trap * X86_TRAP_MF implementations, it's possible * we get a spurious trap, which is not an error. */ return 0; } /* * Initialize register state that may prevent from entering low-power idle. * This function will be invoked from the cpuidle driver only when needed. */ noinstr void fpu_idle_fpregs(void) { /* Note: AMX_TILE being enabled implies XGETBV1 support */ if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) && (xfeatures_in_use() & XFEATURE_MASK_XTILE)) { tile_release(); __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } } |
37 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Symmetric key ciphers. * * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_INTERNAL_SKCIPHER_H #define _CRYPTO_INTERNAL_SKCIPHER_H #include <crypto/algapi.h> #include <crypto/internal/cipher.h> #include <crypto/skcipher.h> #include <linux/list.h> #include <linux/types.h> /* * Set this if your algorithm is sync but needs a reqsize larger * than MAX_SYNC_SKCIPHER_REQSIZE. * * Reuse bit that is specific to hash algorithms. */ #define CRYPTO_ALG_SKCIPHER_REQSIZE_LARGE CRYPTO_ALG_OPTIONAL_KEY struct aead_request; struct rtattr; struct skcipher_instance { void (*free)(struct skcipher_instance *inst); union { struct { char head[offsetof(struct skcipher_alg, base)]; struct crypto_instance base; } s; struct skcipher_alg alg; }; }; struct lskcipher_instance { void (*free)(struct lskcipher_instance *inst); union { struct { char head[offsetof(struct lskcipher_alg, co.base)]; struct crypto_instance base; } s; struct lskcipher_alg alg; }; }; struct crypto_skcipher_spawn { struct crypto_spawn base; }; struct crypto_lskcipher_spawn { struct crypto_spawn base; }; struct skcipher_walk { union { struct { struct page *page; unsigned long offset; } phys; struct { u8 *page; void *addr; } virt; } src, dst; struct scatter_walk in; unsigned int nbytes; struct scatter_walk out; unsigned int total; struct list_head buffers; u8 *page; u8 *buffer; u8 *oiv; void *iv; unsigned int ivsize; int flags; unsigned int blocksize; unsigned int stride; unsigned int alignmask; }; static inline struct crypto_instance *skcipher_crypto_instance( struct skcipher_instance *inst) { return &inst->s.base; } static inline struct crypto_instance *lskcipher_crypto_instance( struct lskcipher_instance *inst) { return &inst->s.base; } static inline struct skcipher_instance *skcipher_alg_instance( struct crypto_skcipher *skcipher) { return container_of(crypto_skcipher_alg(skcipher), struct skcipher_instance, alg); } static inline struct lskcipher_instance *lskcipher_alg_instance( struct crypto_lskcipher *lskcipher) { return container_of(crypto_lskcipher_alg(lskcipher), struct lskcipher_instance, alg); } static inline void *skcipher_instance_ctx(struct skcipher_instance *inst) { return crypto_instance_ctx(skcipher_crypto_instance(inst)); } static inline void *lskcipher_instance_ctx(struct lskcipher_instance *inst) { return crypto_instance_ctx(lskcipher_crypto_instance(inst)); } static inline void skcipher_request_complete(struct skcipher_request *req, int err) { crypto_request_complete(&req->base, err); } int crypto_grab_skcipher(struct crypto_skcipher_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask); int crypto_grab_lskcipher(struct crypto_lskcipher_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask); static inline void crypto_drop_skcipher(struct crypto_skcipher_spawn *spawn) { crypto_drop_spawn(&spawn->base); } static inline void crypto_drop_lskcipher(struct crypto_lskcipher_spawn *spawn) { crypto_drop_spawn(&spawn->base); } static inline struct lskcipher_alg *crypto_lskcipher_spawn_alg( struct crypto_lskcipher_spawn *spawn) { return container_of(spawn->base.alg, struct lskcipher_alg, co.base); } static inline struct skcipher_alg_common *crypto_spawn_skcipher_alg_common( struct crypto_skcipher_spawn *spawn) { return container_of(spawn->base.alg, struct skcipher_alg_common, base); } static inline struct lskcipher_alg *crypto_spawn_lskcipher_alg( struct crypto_lskcipher_spawn *spawn) { return crypto_lskcipher_spawn_alg(spawn); } static inline struct crypto_skcipher *crypto_spawn_skcipher( struct crypto_skcipher_spawn *spawn) { return crypto_spawn_tfm2(&spawn->base); } static inline struct crypto_lskcipher *crypto_spawn_lskcipher( struct crypto_lskcipher_spawn *spawn) { return crypto_spawn_tfm2(&spawn->base); } static inline void crypto_skcipher_set_reqsize( struct crypto_skcipher *skcipher, unsigned int reqsize) { skcipher->reqsize = reqsize; } static inline void crypto_skcipher_set_reqsize_dma( struct crypto_skcipher *skcipher, unsigned int reqsize) { reqsize += crypto_dma_align() & ~(crypto_tfm_ctx_alignment() - 1); skcipher->reqsize = reqsize; } int crypto_register_skcipher(struct skcipher_alg *alg); void crypto_unregister_skcipher(struct skcipher_alg *alg); int crypto_register_skciphers(struct skcipher_alg *algs, int count); void crypto_unregister_skciphers(struct skcipher_alg *algs, int count); int skcipher_register_instance(struct crypto_template *tmpl, struct skcipher_instance *inst); int crypto_register_lskcipher(struct lskcipher_alg *alg); void crypto_unregister_lskcipher(struct lskcipher_alg *alg); int crypto_register_lskciphers(struct lskcipher_alg *algs, int count); void crypto_unregister_lskciphers(struct lskcipher_alg *algs, int count); int lskcipher_register_instance(struct crypto_template *tmpl, struct lskcipher_instance *inst); int skcipher_walk_done(struct skcipher_walk *walk, int err); int skcipher_walk_virt(struct skcipher_walk *walk, struct skcipher_request *req, bool atomic); int skcipher_walk_async(struct skcipher_walk *walk, struct skcipher_request *req); int skcipher_walk_aead_encrypt(struct skcipher_walk *walk, struct aead_request *req, bool atomic); int skcipher_walk_aead_decrypt(struct skcipher_walk *walk, struct aead_request *req, bool atomic); void skcipher_walk_complete(struct skcipher_walk *walk, int err); static inline void skcipher_walk_abort(struct skcipher_walk *walk) { skcipher_walk_done(walk, -ECANCELED); } static inline void *crypto_skcipher_ctx(struct crypto_skcipher *tfm) { return crypto_tfm_ctx(&tfm->base); } static inline void *crypto_lskcipher_ctx(struct crypto_lskcipher *tfm) { return crypto_tfm_ctx(&tfm->base); } static inline void *crypto_skcipher_ctx_dma(struct crypto_skcipher *tfm) { return crypto_tfm_ctx_dma(&tfm->base); } static inline void *skcipher_request_ctx(struct skcipher_request *req) { return req->__ctx; } static inline void *skcipher_request_ctx_dma(struct skcipher_request *req) { unsigned int align = crypto_dma_align(); if (align <= crypto_tfm_ctx_alignment()) align = 1; return PTR_ALIGN(skcipher_request_ctx(req), align); } static inline u32 skcipher_request_flags(struct skcipher_request *req) { return req->base.flags; } /* Helpers for simple block cipher modes of operation */ struct skcipher_ctx_simple { struct crypto_cipher *cipher; /* underlying block cipher */ }; static inline struct crypto_cipher * skcipher_cipher_simple(struct crypto_skcipher *tfm) { struct skcipher_ctx_simple *ctx = crypto_skcipher_ctx(tfm); return ctx->cipher; } struct skcipher_instance *skcipher_alloc_instance_simple( struct crypto_template *tmpl, struct rtattr **tb); static inline struct crypto_alg *skcipher_ialg_simple( struct skcipher_instance *inst) { struct crypto_cipher_spawn *spawn = skcipher_instance_ctx(inst); return crypto_spawn_cipher_alg(spawn); } static inline struct crypto_lskcipher *lskcipher_cipher_simple( struct crypto_lskcipher *tfm) { struct crypto_lskcipher **ctx = crypto_lskcipher_ctx(tfm); return *ctx; } struct lskcipher_instance *lskcipher_alloc_instance_simple( struct crypto_template *tmpl, struct rtattr **tb); static inline struct lskcipher_alg *lskcipher_ialg_simple( struct lskcipher_instance *inst) { struct crypto_lskcipher_spawn *spawn = lskcipher_instance_ctx(inst); return crypto_lskcipher_spawn_alg(spawn); } #endif /* _CRYPTO_INTERNAL_SKCIPHER_H */ |
22 22 32 11 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 | /* * Constant-time equality testing of memory regions. * * Authors: * * James Yonan <james@openvpn.net> * Daniel Borkmann <dborkman@redhat.com> * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * * Copyright(c) 2013 OpenVPN Technologies, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. * The full GNU General Public License is included in this distribution * in the file called LICENSE.GPL. * * BSD LICENSE * * Copyright(c) 2013 OpenVPN Technologies, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * Neither the name of OpenVPN Technologies nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <linux/unaligned.h> #include <crypto/algapi.h> #include <linux/module.h> /* Generic path for arbitrary size */ static inline unsigned long __crypto_memneq_generic(const void *a, const void *b, size_t size) { unsigned long neq = 0; #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) while (size >= sizeof(unsigned long)) { neq |= get_unaligned((unsigned long *)a) ^ get_unaligned((unsigned long *)b); OPTIMIZER_HIDE_VAR(neq); a += sizeof(unsigned long); b += sizeof(unsigned long); size -= sizeof(unsigned long); } #endif /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ while (size > 0) { neq |= *(unsigned char *)a ^ *(unsigned char *)b; OPTIMIZER_HIDE_VAR(neq); a += 1; b += 1; size -= 1; } return neq; } /* Loop-free fast-path for frequently used 16-byte size */ static inline unsigned long __crypto_memneq_16(const void *a, const void *b) { unsigned long neq = 0; #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS if (sizeof(unsigned long) == 8) { neq |= get_unaligned((unsigned long *)a) ^ get_unaligned((unsigned long *)b); OPTIMIZER_HIDE_VAR(neq); neq |= get_unaligned((unsigned long *)(a + 8)) ^ get_unaligned((unsigned long *)(b + 8)); OPTIMIZER_HIDE_VAR(neq); } else if (sizeof(unsigned int) == 4) { neq |= get_unaligned((unsigned int *)a) ^ get_unaligned((unsigned int *)b); OPTIMIZER_HIDE_VAR(neq); neq |= get_unaligned((unsigned int *)(a + 4)) ^ get_unaligned((unsigned int *)(b + 4)); OPTIMIZER_HIDE_VAR(neq); neq |= get_unaligned((unsigned int *)(a + 8)) ^ get_unaligned((unsigned int *)(b + 8)); OPTIMIZER_HIDE_VAR(neq); neq |= get_unaligned((unsigned int *)(a + 12)) ^ get_unaligned((unsigned int *)(b + 12)); OPTIMIZER_HIDE_VAR(neq); } else #endif /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ { neq |= *(unsigned char *)(a) ^ *(unsigned char *)(b); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+1) ^ *(unsigned char *)(b+1); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+2) ^ *(unsigned char *)(b+2); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+3) ^ *(unsigned char *)(b+3); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+4) ^ *(unsigned char *)(b+4); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+5) ^ *(unsigned char *)(b+5); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+6) ^ *(unsigned char *)(b+6); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+7) ^ *(unsigned char *)(b+7); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+8) ^ *(unsigned char *)(b+8); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+9) ^ *(unsigned char *)(b+9); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+10) ^ *(unsigned char *)(b+10); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+11) ^ *(unsigned char *)(b+11); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+12) ^ *(unsigned char *)(b+12); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+13) ^ *(unsigned char *)(b+13); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+14) ^ *(unsigned char *)(b+14); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+15) ^ *(unsigned char *)(b+15); OPTIMIZER_HIDE_VAR(neq); } return neq; } /* Compare two areas of memory without leaking timing information, * and with special optimizations for common sizes. Users should * not call this function directly, but should instead use * crypto_memneq defined in crypto/algapi.h. */ noinline unsigned long __crypto_memneq(const void *a, const void *b, size_t size) { switch (size) { case 16: return __crypto_memneq_16(a, b); default: return __crypto_memneq_generic(a, b, size); } } EXPORT_SYMBOL(__crypto_memneq); |
1 265 6 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner */ #include "bat_v.h" #include "main.h" #include <linux/atomic.h> #include <linux/cache.h> #include <linux/errno.h> #include <linux/if_ether.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/limits.h> #include <linux/list.h> #include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/genetlink.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bat_v_elp.h" #include "bat_v_ogm.h" #include "gateway_client.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "netlink.h" #include "originator.h" static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hard_iface *primary_if; primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if) { batadv_v_elp_iface_activate(primary_if, hard_iface); batadv_hardif_put(primary_if); } /* B.A.T.M.A.N. V does not use any queuing mechanism, therefore it can * set the interface as ACTIVE right away, without any risk of race * condition */ if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED) hard_iface->if_status = BATADV_IF_ACTIVE; } static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface) { int ret; ret = batadv_v_elp_iface_enable(hard_iface); if (ret < 0) return ret; ret = batadv_v_ogm_iface_enable(hard_iface); if (ret < 0) batadv_v_elp_iface_disable(hard_iface); return ret; } static void batadv_v_iface_disable(struct batadv_hard_iface *hard_iface) { batadv_v_ogm_iface_disable(hard_iface); batadv_v_elp_iface_disable(hard_iface); } static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface) { batadv_v_elp_primary_iface_set(hard_iface); batadv_v_ogm_primary_iface_set(hard_iface); } /** * batadv_v_iface_update_mac() - react to hard-interface MAC address change * @hard_iface: the modified interface * * If the modified interface is the primary one, update the originator * address in the ELP and OGM messages to reflect the new MAC address. */ static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hard_iface *primary_if; primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if != hard_iface) goto out; batadv_v_primary_iface_set(hard_iface); out: batadv_hardif_put(primary_if); } static void batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) { ewma_throughput_init(&hardif_neigh->bat_v.throughput); INIT_WORK(&hardif_neigh->bat_v.metric_work, batadv_v_elp_throughput_metric_update); } /** * batadv_v_neigh_dump_neigh() - Dump a neighbour into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @hardif_neigh: Neighbour to dump * * Return: Error code, or 0 on success */ static int batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_hardif_neigh_node *hardif_neigh) { void *hdr; unsigned int last_seen_msecs; u32 throughput; last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen); throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput); throughput = throughput * 100; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_NEIGHBORS); if (!hdr) return -ENOBUFS; if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, hardif_neigh->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, hardif_neigh->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, hardif_neigh->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs) || nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_v_neigh_dump_hardif() - Dump the neighbours of a hard interface into * a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @hard_iface: The hard interface to be dumped * @idx_s: Entries to be skipped * * This function assumes the caller holds rcu_read_lock(). * * Return: Error code, or 0 on success */ static int batadv_v_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *hard_iface, int *idx_s) { struct batadv_hardif_neigh_node *hardif_neigh; int idx = 0; hlist_for_each_entry_rcu(hardif_neigh, &hard_iface->neigh_list, list) { if (idx++ < *idx_s) continue; if (batadv_v_neigh_dump_neigh(msg, portid, seq, hardif_neigh)) { *idx_s = idx - 1; return -EMSGSIZE; } } *idx_s = 0; return 0; } /** * batadv_v_neigh_dump() - Dump the neighbours of a hard interface into a * message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @single_hardif: Limit dumping to this hard interface */ static void batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hard_iface *single_hardif) { struct batadv_hard_iface *hard_iface; int i_hardif = 0; int i_hardif_s = cb->args[0]; int idx = cb->args[1]; int portid = NETLINK_CB(cb->skb).portid; rcu_read_lock(); if (single_hardif) { if (i_hardif_s == 0) { if (batadv_v_neigh_dump_hardif(msg, portid, cb->nlh->nlmsg_seq, bat_priv, single_hardif, &idx) == 0) i_hardif++; } } else { list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != bat_priv->soft_iface) continue; if (i_hardif++ < i_hardif_s) continue; if (batadv_v_neigh_dump_hardif(msg, portid, cb->nlh->nlmsg_seq, bat_priv, hard_iface, &idx)) { i_hardif--; break; } } } rcu_read_unlock(); cb->args[0] = i_hardif; cb->args[1] = idx; } /** * batadv_v_orig_dump_subentry() - Dump an originator subentry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @neigh_node: Single hops neighbour * @best: Is the best originator * * Return: Error code, or 0 on success */ static int batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, bool best) { struct batadv_neigh_ifinfo *n_ifinfo; unsigned int last_seen_msecs; u32 throughput; void *hdr; n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); if (!n_ifinfo) return 0; throughput = n_ifinfo->bat_v.throughput * 100; batadv_neigh_ifinfo_put(n_ifinfo); last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen); if (if_outgoing != BATADV_IF_DEFAULT && if_outgoing != neigh_node->if_incoming) return 0; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_ORIGINATORS); if (!hdr) return -ENOBUFS; if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) || nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, neigh_node->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, neigh_node->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, neigh_node->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs)) goto nla_put_failure; if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_v_orig_dump_entry() - Dump an originator entry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @sub_s: Number of sub entries to skip * * This function assumes the caller holds rcu_read_lock(). * * Return: Error code, or 0 on success */ static int batadv_v_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct batadv_orig_node *orig_node, int *sub_s) { struct batadv_neigh_node *neigh_node_best; struct batadv_neigh_node *neigh_node; int sub = 0; bool best; neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing); if (!neigh_node_best) goto out; hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { if (sub++ < *sub_s) continue; best = (neigh_node == neigh_node_best); if (batadv_v_orig_dump_subentry(msg, portid, seq, bat_priv, if_outgoing, orig_node, neigh_node, best)) { batadv_neigh_node_put(neigh_node_best); *sub_s = sub - 1; return -EMSGSIZE; } } out: batadv_neigh_node_put(neigh_node_best); *sub_s = 0; return 0; } /** * batadv_v_orig_dump_bucket() - Dump an originator bucket into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @head: Bucket to be dumped * @idx_s: Number of entries to be skipped * @sub: Number of sub entries to be skipped * * Return: Error code, or 0 on success */ static int batadv_v_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct hlist_head *head, int *idx_s, int *sub) { struct batadv_orig_node *orig_node; int idx = 0; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { if (idx++ < *idx_s) continue; if (batadv_v_orig_dump_entry(msg, portid, seq, bat_priv, if_outgoing, orig_node, sub)) { rcu_read_unlock(); *idx_s = idx - 1; return -EMSGSIZE; } } rcu_read_unlock(); *idx_s = 0; *sub = 0; return 0; } /** * batadv_v_orig_dump() - Dump the originators into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface */ static void batadv_v_orig_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; int bucket = cb->args[0]; int idx = cb->args[1]; int sub = cb->args[2]; int portid = NETLINK_CB(cb->skb).portid; while (bucket < hash->size) { head = &hash->table[bucket]; if (batadv_v_orig_dump_bucket(msg, portid, cb->nlh->nlmsg_seq, bat_priv, if_outgoing, head, &idx, &sub)) break; bucket++; } cb->args[0] = bucket; cb->args[1] = idx; cb->args[2] = sub; } static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) { struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2; int ret = 0; ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); if (!ifinfo1) goto err_ifinfo1; ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); if (!ifinfo2) goto err_ifinfo2; ret = ifinfo1->bat_v.throughput - ifinfo2->bat_v.throughput; batadv_neigh_ifinfo_put(ifinfo2); err_ifinfo2: batadv_neigh_ifinfo_put(ifinfo1); err_ifinfo1: return ret; } static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) { struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2; u32 threshold; bool ret = false; ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); if (!ifinfo1) goto err_ifinfo1; ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); if (!ifinfo2) goto err_ifinfo2; threshold = ifinfo1->bat_v.throughput / 4; threshold = ifinfo1->bat_v.throughput - threshold; ret = ifinfo2->bat_v.throughput > threshold; batadv_neigh_ifinfo_put(ifinfo2); err_ifinfo2: batadv_neigh_ifinfo_put(ifinfo1); err_ifinfo1: return ret; } /** * batadv_v_init_sel_class() - initialize GW selection class * @bat_priv: the bat priv with all the soft interface information */ static void batadv_v_init_sel_class(struct batadv_priv *bat_priv) { /* set default throughput difference threshold to 5Mbps */ atomic_set(&bat_priv->gw.sel_class, 50); } /** * batadv_v_gw_throughput_get() - retrieve the GW-bandwidth for a given GW * @gw_node: the GW to retrieve the metric for * @bw: the pointer where the metric will be stored. The metric is computed as * the minimum between the GW advertised throughput and the path throughput to * it in the mesh * * Return: 0 on success, -1 on failure */ static int batadv_v_gw_throughput_get(struct batadv_gw_node *gw_node, u32 *bw) { struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_orig_node *orig_node; struct batadv_neigh_node *router; int ret = -1; orig_node = gw_node->orig_node; router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); if (!router) goto out; router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) goto out; /* the GW metric is computed as the minimum between the path throughput * to reach the GW itself and the advertised bandwidth. * This gives us an approximation of the effective throughput that the * client can expect via this particular GW node */ *bw = router_ifinfo->bat_v.throughput; *bw = min_t(u32, *bw, gw_node->bandwidth_down); ret = 0; out: batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(router_ifinfo); return ret; } /** * batadv_v_gw_get_best_gw_node() - retrieve the best GW node * @bat_priv: the bat priv with all the soft interface information * * Return: the GW node having the best GW-metric, NULL if no GW is known */ static struct batadv_gw_node * batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv) { struct batadv_gw_node *gw_node, *curr_gw = NULL; u32 max_bw = 0, bw; rcu_read_lock(); hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) { if (!kref_get_unless_zero(&gw_node->refcount)) continue; if (batadv_v_gw_throughput_get(gw_node, &bw) < 0) goto next; if (curr_gw && bw <= max_bw) goto next; batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); max_bw = bw; next: batadv_gw_node_put(gw_node); } rcu_read_unlock(); return curr_gw; } /** * batadv_v_gw_is_eligible() - check if a originator would be selected as GW * @bat_priv: the bat priv with all the soft interface information * @curr_gw_orig: originator representing the currently selected GW * @orig_node: the originator representing the new candidate * * Return: true if orig_node can be selected as current GW, false otherwise */ static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv, struct batadv_orig_node *curr_gw_orig, struct batadv_orig_node *orig_node) { struct batadv_gw_node *curr_gw, *orig_gw = NULL; u32 gw_throughput, orig_throughput, threshold; bool ret = false; threshold = atomic_read(&bat_priv->gw.sel_class); curr_gw = batadv_gw_node_get(bat_priv, curr_gw_orig); if (!curr_gw) { ret = true; goto out; } if (batadv_v_gw_throughput_get(curr_gw, &gw_throughput) < 0) { ret = true; goto out; } orig_gw = batadv_gw_node_get(bat_priv, orig_node); if (!orig_gw) goto out; if (batadv_v_gw_throughput_get(orig_gw, &orig_throughput) < 0) goto out; if (orig_throughput < gw_throughput) goto out; if ((orig_throughput - gw_throughput) < threshold) goto out; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Restarting gateway selection: better gateway found (throughput curr: %u, throughput new: %u)\n", gw_throughput, orig_throughput); ret = true; out: batadv_gw_node_put(curr_gw); batadv_gw_node_put(orig_gw); return ret; } /** * batadv_v_gw_dump_entry() - Dump a gateway into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @gw_node: Gateway to be dumped * * Return: Error code, or 0 on success */ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_gw_node *gw_node) { struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *router; struct batadv_gw_node *curr_gw = NULL; int ret = 0; void *hdr; router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); if (!router) goto out; router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) goto out; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS); if (!hdr) { ret = -ENOBUFS; goto out; } genl_dump_check_consistent(cb, hdr); ret = -EMSGSIZE; if (curr_gw == gw_node) { if (nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) { genlmsg_cancel(msg, hdr); goto out; } } if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, gw_node->orig_node->orig)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, router_ifinfo->bat_v.throughput)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put(msg, BATADV_ATTR_ROUTER, ETH_ALEN, router->addr)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, router->if_incoming->net_dev->name)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, router->if_incoming->net_dev->ifindex)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN, gw_node->bandwidth_down)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP, gw_node->bandwidth_up)) { genlmsg_cancel(msg, hdr); goto out; } genlmsg_end(msg, hdr); ret = 0; out: batadv_gw_node_put(curr_gw); batadv_neigh_ifinfo_put(router_ifinfo); batadv_neigh_node_put(router); return ret; } /** * batadv_v_gw_dump() - Dump gateways into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information */ static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv) { int portid = NETLINK_CB(cb->skb).portid; struct batadv_gw_node *gw_node; int idx_skip = cb->args[0]; int idx = 0; spin_lock_bh(&bat_priv->gw.list_lock); cb->seq = bat_priv->gw.generation << 1 | 1; hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) { if (idx++ < idx_skip) continue; if (batadv_v_gw_dump_entry(msg, portid, cb, bat_priv, gw_node)) { idx_skip = idx - 1; goto unlock; } } idx_skip = idx; unlock: spin_unlock_bh(&bat_priv->gw.list_lock); cb->args[0] = idx_skip; } static struct batadv_algo_ops batadv_batman_v __read_mostly = { .name = "BATMAN_V", .iface = { .activate = batadv_v_iface_activate, .enable = batadv_v_iface_enable, .disable = batadv_v_iface_disable, .update_mac = batadv_v_iface_update_mac, .primary_set = batadv_v_primary_iface_set, }, .neigh = { .hardif_init = batadv_v_hardif_neigh_init, .cmp = batadv_v_neigh_cmp, .is_similar_or_better = batadv_v_neigh_is_sob, .dump = batadv_v_neigh_dump, }, .orig = { .dump = batadv_v_orig_dump, }, .gw = { .init_sel_class = batadv_v_init_sel_class, .sel_class_max = U32_MAX, .get_best_gw_node = batadv_v_gw_get_best_gw_node, .is_eligible = batadv_v_gw_is_eligible, .dump = batadv_v_gw_dump, }, }; /** * batadv_v_hardif_init() - initialize the algorithm specific fields in the * hard-interface object * @hard_iface: the hard-interface to initialize */ void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface) { /* enable link throughput auto-detection by setting the throughput * override to zero */ atomic_set(&hard_iface->bat_v.throughput_override, 0); atomic_set(&hard_iface->bat_v.elp_interval, 500); hard_iface->bat_v.aggr_len = 0; skb_queue_head_init(&hard_iface->bat_v.aggr_list); INIT_DELAYED_WORK(&hard_iface->bat_v.aggr_wq, batadv_v_ogm_aggr_work); } /** * batadv_v_mesh_init() - initialize the B.A.T.M.A.N. V private resources for a * mesh * @bat_priv: the object representing the mesh interface to initialise * * Return: 0 on success or a negative error code otherwise */ int batadv_v_mesh_init(struct batadv_priv *bat_priv) { int ret = 0; ret = batadv_v_ogm_init(bat_priv); if (ret < 0) return ret; return 0; } /** * batadv_v_mesh_free() - free the B.A.T.M.A.N. V private resources for a mesh * @bat_priv: the object representing the mesh interface to free */ void batadv_v_mesh_free(struct batadv_priv *bat_priv) { batadv_v_ogm_free(bat_priv); } /** * batadv_v_init() - B.A.T.M.A.N. V initialization function * * Description: Takes care of initializing all the subcomponents. * It is invoked upon module load only. * * Return: 0 on success or a negative error code otherwise */ int __init batadv_v_init(void) { int ret; /* B.A.T.M.A.N. V echo location protocol packet */ ret = batadv_recv_handler_register(BATADV_ELP, batadv_v_elp_packet_recv); if (ret < 0) return ret; ret = batadv_recv_handler_register(BATADV_OGM2, batadv_v_ogm_packet_recv); if (ret < 0) goto elp_unregister; ret = batadv_algo_register(&batadv_batman_v); if (ret < 0) goto ogm_unregister; return ret; ogm_unregister: batadv_recv_handler_unregister(BATADV_OGM2); elp_unregister: batadv_recv_handler_unregister(BATADV_ELP); return ret; } |
4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 | /* * Copyright (c) 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. * Copyright (c) 2005-2017 Mellanox Technologies. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef RDMA_CORE_H #define RDMA_CORE_H #include <linux/idr.h> #include <rdma/uverbs_types.h> #include <rdma/uverbs_ioctl.h> #include <rdma/ib_verbs.h> #include <linux/mutex.h> struct ib_uverbs_device; void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile, enum rdma_remove_reason reason); int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs); /* * Get an ib_uobject that corresponds to the given id from ufile, assuming * the object is from the given type. Lock it to the required access when * applicable. * This function could create (access == NEW), destroy (access == DESTROY) * or unlock (access == READ || access == WRITE) objects if required. * The action will be finalized only when uverbs_finalize_object or * uverbs_finalize_objects are called. */ struct ib_uobject * uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access, s64 id, struct uverbs_attr_bundle *attrs); void uverbs_finalize_object(struct ib_uobject *uobj, enum uverbs_obj_access access, bool hw_obj_valid, bool commit, struct uverbs_attr_bundle *attrs); int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx); void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile); void release_ufile_idr_uobject(struct ib_uverbs_file *ufile); struct ib_udata *uverbs_get_cleared_udata(struct uverbs_attr_bundle *attrs); /* * This is the runtime description of the uverbs API, used by the syscall * machinery to validate and dispatch calls. */ /* * Depending on ID the slot pointer in the radix tree points at one of these * structs. */ struct uverbs_api_ioctl_method { int(__rcu *handler)(struct uverbs_attr_bundle *attrs); DECLARE_BITMAP(attr_mandatory, UVERBS_API_ATTR_BKEY_LEN); u16 bundle_size; u8 use_stack:1; u8 driver_method:1; u8 disabled:1; u8 has_udata:1; u8 key_bitmap_len; u8 destroy_bkey; }; struct uverbs_api_write_method { int (*handler)(struct uverbs_attr_bundle *attrs); u8 disabled:1; u8 is_ex:1; u8 has_udata:1; u8 has_resp:1; u8 req_size; u8 resp_size; }; struct uverbs_api_attr { struct uverbs_attr_spec spec; }; struct uverbs_api { /* radix tree contains struct uverbs_api_* pointers */ struct radix_tree_root radix; enum rdma_driver_id driver_id; unsigned int num_write; unsigned int num_write_ex; struct uverbs_api_write_method notsupp_method; const struct uverbs_api_write_method **write_methods; const struct uverbs_api_write_method **write_ex_methods; }; /* * Get an uverbs_api_object that corresponds to the given object_id. * Note: * -ENOMSG means that any object is allowed to match during lookup. */ static inline const struct uverbs_api_object * uapi_get_object(struct uverbs_api *uapi, u16 object_id) { const struct uverbs_api_object *res; if (object_id == UVERBS_IDR_ANY_OBJECT) return ERR_PTR(-ENOMSG); res = radix_tree_lookup(&uapi->radix, uapi_key_obj(object_id)); if (!res) return ERR_PTR(-ENOENT); return res; } char *uapi_key_format(char *S, unsigned int key); struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev); void uverbs_disassociate_api_pre(struct ib_uverbs_device *uverbs_dev); void uverbs_disassociate_api(struct uverbs_api *uapi); void uverbs_destroy_api(struct uverbs_api *uapi); void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm, unsigned int num_attrs); void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile); extern const struct uapi_definition uverbs_def_obj_async_fd[]; extern const struct uapi_definition uverbs_def_obj_counters[]; extern const struct uapi_definition uverbs_def_obj_cq[]; extern const struct uapi_definition uverbs_def_obj_device[]; extern const struct uapi_definition uverbs_def_obj_dm[]; extern const struct uapi_definition uverbs_def_obj_flow_action[]; extern const struct uapi_definition uverbs_def_obj_intf[]; extern const struct uapi_definition uverbs_def_obj_mr[]; extern const struct uapi_definition uverbs_def_obj_qp[]; extern const struct uapi_definition uverbs_def_obj_srq[]; extern const struct uapi_definition uverbs_def_obj_wq[]; extern const struct uapi_definition uverbs_def_write_intf[]; static inline const struct uverbs_api_write_method * uapi_get_method(const struct uverbs_api *uapi, u32 command) { u32 cmd_idx = command & IB_USER_VERBS_CMD_COMMAND_MASK; if (command & ~(u32)(IB_USER_VERBS_CMD_FLAG_EXTENDED | IB_USER_VERBS_CMD_COMMAND_MASK)) return ERR_PTR(-EINVAL); if (command & IB_USER_VERBS_CMD_FLAG_EXTENDED) { if (cmd_idx >= uapi->num_write_ex) return ERR_PTR(-EOPNOTSUPP); return uapi->write_ex_methods[cmd_idx]; } if (cmd_idx >= uapi->num_write) return ERR_PTR(-EOPNOTSUPP); return uapi->write_methods[cmd_idx]; } void uverbs_fill_udata(struct uverbs_attr_bundle *bundle, struct ib_udata *udata, unsigned int attr_in, unsigned int attr_out); #endif /* RDMA_CORE_H */ |
4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 | /* * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <rdma/rdma_user_ioctl.h> #include <rdma/uverbs_ioctl.h> #include "rdma_core.h" #include "uverbs.h" struct bundle_alloc_head { struct_group_tagged(bundle_alloc_head_hdr, hdr, struct bundle_alloc_head *next; ); u8 data[]; }; struct bundle_priv { /* Must be first */ struct bundle_alloc_head_hdr alloc_head; struct bundle_alloc_head *allocated_mem; size_t internal_avail; size_t internal_used; struct radix_tree_root *radix; const struct uverbs_api_ioctl_method *method_elm; void __rcu **radix_slots; unsigned long radix_slots_len; u32 method_key; struct ib_uverbs_attr __user *user_attrs; struct ib_uverbs_attr *uattrs; DECLARE_BITMAP(uobj_finalize, UVERBS_API_ATTR_BKEY_LEN); DECLARE_BITMAP(spec_finalize, UVERBS_API_ATTR_BKEY_LEN); DECLARE_BITMAP(uobj_hw_obj_valid, UVERBS_API_ATTR_BKEY_LEN); /* * Must be last. bundle ends in a flex array which overlaps * internal_buffer. */ struct uverbs_attr_bundle_hdr bundle; u64 internal_buffer[32]; }; /* * Each method has an absolute minimum amount of memory it needs to allocate, * precompute that amount and determine if the onstack memory can be used or * if allocation is need. */ void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm, unsigned int num_attrs) { struct bundle_priv *pbundle; struct uverbs_attr_bundle *bundle; size_t bundle_size = offsetof(struct bundle_priv, internal_buffer) + sizeof(*bundle->attrs) * method_elm->key_bitmap_len + sizeof(*pbundle->uattrs) * num_attrs; method_elm->use_stack = bundle_size <= sizeof(*pbundle); method_elm->bundle_size = ALIGN(bundle_size + 256, sizeof(*pbundle->internal_buffer)); /* Do not want order-2 allocations for this. */ WARN_ON_ONCE(method_elm->bundle_size > PAGE_SIZE); } /** * _uverbs_alloc() - Quickly allocate memory for use with a bundle * @bundle: The bundle * @size: Number of bytes to allocate * @flags: Allocator flags * * The bundle allocator is intended for allocations that are connected with * processing the system call related to the bundle. The allocated memory is * always freed once the system call completes, and cannot be freed any other * way. * * This tries to use a small pool of pre-allocated memory for performance. */ __malloc void *_uverbs_alloc(struct uverbs_attr_bundle *bundle, size_t size, gfp_t flags) { struct bundle_priv *pbundle = container_of(&bundle->hdr, struct bundle_priv, bundle); size_t new_used; void *res; if (check_add_overflow(size, pbundle->internal_used, &new_used)) return ERR_PTR(-EOVERFLOW); if (new_used > pbundle->internal_avail) { struct bundle_alloc_head *buf; buf = kvmalloc(struct_size(buf, data, size), flags); if (!buf) return ERR_PTR(-ENOMEM); buf->next = pbundle->allocated_mem; pbundle->allocated_mem = buf; return buf->data; } res = (void *)pbundle->internal_buffer + pbundle->internal_used; pbundle->internal_used = ALIGN(new_used, sizeof(*pbundle->internal_buffer)); if (want_init_on_alloc(flags)) memset(res, 0, size); return res; } EXPORT_SYMBOL(_uverbs_alloc); static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr, u16 len) { if (uattr->len > sizeof_field(struct ib_uverbs_attr, data)) return ib_is_buffer_cleared(u64_to_user_ptr(uattr->data) + len, uattr->len - len); return !memchr_inv((const void *)&uattr->data + len, 0, uattr->len - len); } static int uverbs_set_output(const struct uverbs_attr_bundle *bundle, const struct uverbs_attr *attr) { struct bundle_priv *pbundle = container_of(&bundle->hdr, struct bundle_priv, bundle); u16 flags; flags = pbundle->uattrs[attr->ptr_attr.uattr_idx].flags | UVERBS_ATTR_F_VALID_OUTPUT; if (put_user(flags, &pbundle->user_attrs[attr->ptr_attr.uattr_idx].flags)) return -EFAULT; return 0; } static int uverbs_process_idrs_array(struct bundle_priv *pbundle, const struct uverbs_api_attr *attr_uapi, struct uverbs_objs_arr_attr *attr, struct ib_uverbs_attr *uattr, u32 attr_bkey) { struct uverbs_attr_bundle *bundle = container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr); const struct uverbs_attr_spec *spec = &attr_uapi->spec; size_t array_len; u32 *idr_vals; int ret = 0; size_t i; if (uattr->attr_data.reserved) return -EINVAL; if (uattr->len % sizeof(u32)) return -EINVAL; array_len = uattr->len / sizeof(u32); if (array_len < spec->u2.objs_arr.min_len || array_len > spec->u2.objs_arr.max_len) return -EINVAL; attr->uobjects = uverbs_alloc(bundle, array_size(array_len, sizeof(*attr->uobjects))); if (IS_ERR(attr->uobjects)) return PTR_ERR(attr->uobjects); /* * Since idr is 4B and *uobjects is >= 4B, we can use attr->uobjects * to store idrs array and avoid additional memory allocation. The * idrs array is offset to the end of the uobjects array so we will be * able to read idr and replace with a pointer. */ idr_vals = (u32 *)(attr->uobjects + array_len) - array_len; if (uattr->len > sizeof(uattr->data)) { ret = copy_from_user(idr_vals, u64_to_user_ptr(uattr->data), uattr->len); if (ret) return -EFAULT; } else { memcpy(idr_vals, &uattr->data, uattr->len); } for (i = 0; i != array_len; i++) { attr->uobjects[i] = uverbs_get_uobject_from_file( spec->u2.objs_arr.obj_type, spec->u2.objs_arr.access, idr_vals[i], bundle); if (IS_ERR(attr->uobjects[i])) { ret = PTR_ERR(attr->uobjects[i]); break; } } attr->len = i; __set_bit(attr_bkey, pbundle->spec_finalize); return ret; } static void uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi, struct uverbs_objs_arr_attr *attr, bool commit, struct uverbs_attr_bundle *attrs) { const struct uverbs_attr_spec *spec = &attr_uapi->spec; size_t i; for (i = 0; i != attr->len; i++) uverbs_finalize_object(attr->uobjects[i], spec->u2.objs_arr.access, false, commit, attrs); } static int uverbs_process_attr(struct bundle_priv *pbundle, const struct uverbs_api_attr *attr_uapi, struct ib_uverbs_attr *uattr, u32 attr_bkey) { const struct uverbs_attr_spec *spec = &attr_uapi->spec; struct uverbs_attr_bundle *bundle = container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr); struct uverbs_attr *e = &bundle->attrs[attr_bkey]; const struct uverbs_attr_spec *val_spec = spec; struct uverbs_obj_attr *o_attr; switch (spec->type) { case UVERBS_ATTR_TYPE_ENUM_IN: if (uattr->attr_data.enum_data.elem_id >= spec->u.enum_def.num_elems) return -EOPNOTSUPP; if (uattr->attr_data.enum_data.reserved) return -EINVAL; val_spec = &spec->u2.enum_def.ids[uattr->attr_data.enum_data.elem_id]; /* Currently we only support PTR_IN based enums */ if (val_spec->type != UVERBS_ATTR_TYPE_PTR_IN) return -EOPNOTSUPP; e->ptr_attr.enum_id = uattr->attr_data.enum_data.elem_id; fallthrough; case UVERBS_ATTR_TYPE_PTR_IN: /* Ensure that any data provided by userspace beyond the known * struct is zero. Userspace that knows how to use some future * longer struct will fail here if used with an old kernel and * non-zero content, making ABI compat/discovery simpler. */ if (uattr->len > val_spec->u.ptr.len && val_spec->zero_trailing && !uverbs_is_attr_cleared(uattr, val_spec->u.ptr.len)) return -EOPNOTSUPP; fallthrough; case UVERBS_ATTR_TYPE_PTR_OUT: if (uattr->len < val_spec->u.ptr.min_len || (!val_spec->zero_trailing && uattr->len > val_spec->u.ptr.len)) return -EINVAL; if (spec->type != UVERBS_ATTR_TYPE_ENUM_IN && uattr->attr_data.reserved) return -EINVAL; e->ptr_attr.uattr_idx = uattr - pbundle->uattrs; e->ptr_attr.len = uattr->len; if (val_spec->alloc_and_copy && !uverbs_attr_ptr_is_inline(e)) { void *p; p = uverbs_alloc(bundle, uattr->len); if (IS_ERR(p)) return PTR_ERR(p); e->ptr_attr.ptr = p; if (copy_from_user(p, u64_to_user_ptr(uattr->data), uattr->len)) return -EFAULT; } else { e->ptr_attr.data = uattr->data; } break; case UVERBS_ATTR_TYPE_IDR: case UVERBS_ATTR_TYPE_FD: if (uattr->attr_data.reserved) return -EINVAL; if (uattr->len != 0) return -EINVAL; o_attr = &e->obj_attr; o_attr->attr_elm = attr_uapi; /* * The type of uattr->data is u64 for UVERBS_ATTR_TYPE_IDR and * s64 for UVERBS_ATTR_TYPE_FD. We can cast the u64 to s64 * here without caring about truncation as we know that the * IDR implementation today rejects negative IDs */ o_attr->uobject = uverbs_get_uobject_from_file( spec->u.obj.obj_type, spec->u.obj.access, uattr->data_s64, bundle); if (IS_ERR(o_attr->uobject)) return PTR_ERR(o_attr->uobject); __set_bit(attr_bkey, pbundle->uobj_finalize); if (spec->u.obj.access == UVERBS_ACCESS_NEW) { unsigned int uattr_idx = uattr - pbundle->uattrs; s64 id = o_attr->uobject->id; /* Copy the allocated id to the user-space */ if (put_user(id, &pbundle->user_attrs[uattr_idx].data)) return -EFAULT; } break; case UVERBS_ATTR_TYPE_RAW_FD: if (uattr->attr_data.reserved || uattr->len != 0 || uattr->data_s64 < INT_MIN || uattr->data_s64 > INT_MAX) return -EINVAL; /* _uverbs_get_const_signed() is the accessor */ e->ptr_attr.data = uattr->data_s64; break; case UVERBS_ATTR_TYPE_IDRS_ARRAY: return uverbs_process_idrs_array(pbundle, attr_uapi, &e->objs_arr_attr, uattr, attr_bkey); default: return -EOPNOTSUPP; } return 0; } /* * We search the radix tree with the method prefix and now we want to fast * search the suffix bits to get a particular attribute pointer. It is not * totally clear to me if this breaks the radix tree encasulation or not, but * it uses the iter data to determine if the method iter points at the same * chunk that will store the attribute, if so it just derefs it directly. By * construction in most kernel configs the method and attrs will all fit in a * single radix chunk, so in most cases this will have no search. Other cases * this falls back to a full search. */ static void __rcu **uapi_get_attr_for_method(struct bundle_priv *pbundle, u32 attr_key) { void __rcu **slot; if (likely(attr_key < pbundle->radix_slots_len)) { void *entry; slot = pbundle->radix_slots + attr_key; entry = rcu_dereference_raw(*slot); if (likely(!radix_tree_is_internal_node(entry) && entry)) return slot; } return radix_tree_lookup_slot(pbundle->radix, pbundle->method_key | attr_key); } static int uverbs_set_attr(struct bundle_priv *pbundle, struct ib_uverbs_attr *uattr) { u32 attr_key = uapi_key_attr(uattr->attr_id); u32 attr_bkey = uapi_bkey_attr(attr_key); const struct uverbs_api_attr *attr; void __rcu **slot; int ret; slot = uapi_get_attr_for_method(pbundle, attr_key); if (!slot) { /* * Kernel does not support the attribute but user-space says it * is mandatory */ if (uattr->flags & UVERBS_ATTR_F_MANDATORY) return -EPROTONOSUPPORT; return 0; } attr = rcu_dereference_protected(*slot, true); /* Reject duplicate attributes from user-space */ if (test_bit(attr_bkey, pbundle->bundle.attr_present)) return -EINVAL; ret = uverbs_process_attr(pbundle, attr, uattr, attr_bkey); if (ret) return ret; __set_bit(attr_bkey, pbundle->bundle.attr_present); return 0; } static int ib_uverbs_run_method(struct bundle_priv *pbundle, unsigned int num_attrs) { int (*handler)(struct uverbs_attr_bundle *attrs); struct uverbs_attr_bundle *bundle = container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr); size_t uattrs_size = array_size(sizeof(*pbundle->uattrs), num_attrs); unsigned int destroy_bkey = pbundle->method_elm->destroy_bkey; unsigned int i; int ret; /* See uverbs_disassociate_api() */ handler = srcu_dereference( pbundle->method_elm->handler, &pbundle->bundle.ufile->device->disassociate_srcu); if (!handler) return -EIO; pbundle->uattrs = uverbs_alloc(bundle, uattrs_size); if (IS_ERR(pbundle->uattrs)) return PTR_ERR(pbundle->uattrs); if (copy_from_user(pbundle->uattrs, pbundle->user_attrs, uattrs_size)) return -EFAULT; for (i = 0; i != num_attrs; i++) { ret = uverbs_set_attr(pbundle, &pbundle->uattrs[i]); if (unlikely(ret)) return ret; } /* User space did not provide all the mandatory attributes */ if (unlikely(!bitmap_subset(pbundle->method_elm->attr_mandatory, pbundle->bundle.attr_present, pbundle->method_elm->key_bitmap_len))) return -EINVAL; if (pbundle->method_elm->has_udata) uverbs_fill_udata(bundle, &pbundle->bundle.driver_udata, UVERBS_ATTR_UHW_IN, UVERBS_ATTR_UHW_OUT); else pbundle->bundle.driver_udata = (struct ib_udata){}; if (destroy_bkey != UVERBS_API_ATTR_BKEY_LEN) { struct uverbs_obj_attr *destroy_attr = &bundle->attrs[destroy_bkey].obj_attr; ret = uobj_destroy(destroy_attr->uobject, bundle); if (ret) return ret; __clear_bit(destroy_bkey, pbundle->uobj_finalize); ret = handler(bundle); uobj_put_destroy(destroy_attr->uobject); } else { ret = handler(bundle); } /* * Until the drivers are revised to use the bundle directly we have to * assume that the driver wrote to its UHW_OUT and flag userspace * appropriately. */ if (!ret && pbundle->method_elm->has_udata) { const struct uverbs_attr *attr = uverbs_attr_get(bundle, UVERBS_ATTR_UHW_OUT); if (!IS_ERR(attr)) ret = uverbs_set_output(bundle, attr); } /* * EPROTONOSUPPORT is ONLY to be returned if the ioctl framework can * not invoke the method because the request is not supported. No * other cases should return this code. */ if (WARN_ON_ONCE(ret == -EPROTONOSUPPORT)) return -EINVAL; return ret; } static void bundle_destroy(struct bundle_priv *pbundle, bool commit) { unsigned int key_bitmap_len = pbundle->method_elm->key_bitmap_len; struct uverbs_attr_bundle *bundle = container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr); struct bundle_alloc_head *memblock; unsigned int i; /* fast path for simple uobjects */ i = -1; while ((i = find_next_bit(pbundle->uobj_finalize, key_bitmap_len, i + 1)) < key_bitmap_len) { struct uverbs_attr *attr = &bundle->attrs[i]; uverbs_finalize_object( attr->obj_attr.uobject, attr->obj_attr.attr_elm->spec.u.obj.access, test_bit(i, pbundle->uobj_hw_obj_valid), commit, bundle); } i = -1; while ((i = find_next_bit(pbundle->spec_finalize, key_bitmap_len, i + 1)) < key_bitmap_len) { struct uverbs_attr *attr = &bundle->attrs[i]; const struct uverbs_api_attr *attr_uapi; void __rcu **slot; slot = uapi_get_attr_for_method( pbundle, pbundle->method_key | uapi_bkey_to_key_attr(i)); if (WARN_ON(!slot)) continue; attr_uapi = rcu_dereference_protected(*slot, true); if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { uverbs_free_idrs_array(attr_uapi, &attr->objs_arr_attr, commit, bundle); } } for (memblock = pbundle->allocated_mem; memblock;) { struct bundle_alloc_head *tmp = memblock; memblock = memblock->next; kvfree(tmp); } } static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, struct ib_uverbs_ioctl_hdr *hdr, struct ib_uverbs_attr __user *user_attrs) { const struct uverbs_api_ioctl_method *method_elm; struct uverbs_api *uapi = ufile->device->uapi; struct radix_tree_iter attrs_iter; struct bundle_priv *pbundle; struct bundle_priv onstack; void __rcu **slot; int ret; if (unlikely(hdr->driver_id != uapi->driver_id)) return -EINVAL; slot = radix_tree_iter_lookup( &uapi->radix, &attrs_iter, uapi_key_obj(hdr->object_id) | uapi_key_ioctl_method(hdr->method_id)); if (unlikely(!slot)) return -EPROTONOSUPPORT; method_elm = rcu_dereference_protected(*slot, true); if (!method_elm->use_stack) { pbundle = kmalloc(method_elm->bundle_size, GFP_KERNEL); if (!pbundle) return -ENOMEM; pbundle->internal_avail = method_elm->bundle_size - offsetof(struct bundle_priv, internal_buffer); pbundle->alloc_head.next = NULL; pbundle->allocated_mem = container_of(&pbundle->alloc_head, struct bundle_alloc_head, hdr); } else { pbundle = &onstack; pbundle->internal_avail = sizeof(pbundle->internal_buffer); pbundle->allocated_mem = NULL; } /* Space for the pbundle->bundle.attrs flex array */ pbundle->method_elm = method_elm; pbundle->method_key = attrs_iter.index; pbundle->bundle.ufile = ufile; pbundle->bundle.context = NULL; /* only valid if bundle has uobject */ pbundle->radix = &uapi->radix; pbundle->radix_slots = slot; pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter); pbundle->user_attrs = user_attrs; pbundle->internal_used = ALIGN(pbundle->method_elm->key_bitmap_len * sizeof(*container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr)->attrs), sizeof(*pbundle->internal_buffer)); memset(pbundle->bundle.attr_present, 0, sizeof(pbundle->bundle.attr_present)); memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize)); memset(pbundle->spec_finalize, 0, sizeof(pbundle->spec_finalize)); memset(pbundle->uobj_hw_obj_valid, 0, sizeof(pbundle->uobj_hw_obj_valid)); ret = ib_uverbs_run_method(pbundle, hdr->num_attrs); bundle_destroy(pbundle, ret == 0); return ret; } long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct ib_uverbs_file *file = filp->private_data; struct ib_uverbs_ioctl_hdr __user *user_hdr = (struct ib_uverbs_ioctl_hdr __user *)arg; struct ib_uverbs_ioctl_hdr hdr; int srcu_key; int err; if (unlikely(cmd != RDMA_VERBS_IOCTL)) return -ENOIOCTLCMD; err = copy_from_user(&hdr, user_hdr, sizeof(hdr)); if (err) return -EFAULT; if (hdr.length > PAGE_SIZE || hdr.length != struct_size(&hdr, attrs, hdr.num_attrs)) return -EINVAL; if (hdr.reserved1 || hdr.reserved2) return -EPROTONOSUPPORT; srcu_key = srcu_read_lock(&file->device->disassociate_srcu); err = ib_uverbs_cmd_verbs(file, &hdr, user_hdr->attrs); srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); return err; } int uverbs_get_flags64(u64 *to, const struct uverbs_attr_bundle *attrs_bundle, size_t idx, u64 allowed_bits) { const struct uverbs_attr *attr; u64 flags; attr = uverbs_attr_get(attrs_bundle, idx); /* Missing attribute means 0 flags */ if (IS_ERR(attr)) { *to = 0; return 0; } /* * New userspace code should use 8 bytes to pass flags, but we * transparently support old userspaces that were using 4 bytes as * well. */ if (attr->ptr_attr.len == 8) flags = attr->ptr_attr.data; else if (attr->ptr_attr.len == 4) flags = *(u32 *)&attr->ptr_attr.data; else return -EINVAL; if (flags & ~allowed_bits) return -EINVAL; *to = flags; return 0; } EXPORT_SYMBOL(uverbs_get_flags64); int uverbs_get_flags32(u32 *to, const struct uverbs_attr_bundle *attrs_bundle, size_t idx, u64 allowed_bits) { u64 flags; int ret; ret = uverbs_get_flags64(&flags, attrs_bundle, idx, allowed_bits); if (ret) return ret; if (flags > U32_MAX) return -EINVAL; *to = flags; return 0; } EXPORT_SYMBOL(uverbs_get_flags32); /* * Fill a ib_udata struct (core or uhw) using the given attribute IDs. * This is primarily used to convert the UVERBS_ATTR_UHW() into the * ib_udata format used by the drivers. */ void uverbs_fill_udata(struct uverbs_attr_bundle *bundle, struct ib_udata *udata, unsigned int attr_in, unsigned int attr_out) { struct bundle_priv *pbundle = container_of(&bundle->hdr, struct bundle_priv, bundle); struct uverbs_attr_bundle *bundle_aux = container_of(&pbundle->bundle, struct uverbs_attr_bundle, hdr); const struct uverbs_attr *in = uverbs_attr_get(bundle_aux, attr_in); const struct uverbs_attr *out = uverbs_attr_get(bundle_aux, attr_out); if (!IS_ERR(in)) { udata->inlen = in->ptr_attr.len; if (uverbs_attr_ptr_is_inline(in)) udata->inbuf = &pbundle->user_attrs[in->ptr_attr.uattr_idx] .data; else udata->inbuf = u64_to_user_ptr(in->ptr_attr.data); } else { udata->inbuf = NULL; udata->inlen = 0; } if (!IS_ERR(out)) { udata->outbuf = u64_to_user_ptr(out->ptr_attr.data); udata->outlen = out->ptr_attr.len; } else { udata->outbuf = NULL; udata->outlen = 0; } } int uverbs_copy_to(const struct uverbs_attr_bundle *bundle, size_t idx, const void *from, size_t size) { const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx); size_t min_size; if (IS_ERR(attr)) return PTR_ERR(attr); min_size = min_t(size_t, attr->ptr_attr.len, size); if (copy_to_user(u64_to_user_ptr(attr->ptr_attr.data), from, min_size)) return -EFAULT; return uverbs_set_output(bundle, attr); } EXPORT_SYMBOL(uverbs_copy_to); /* * This is only used if the caller has directly used copy_to_use to write the * data. It signals to user space that the buffer is filled in. */ int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx) { const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx); if (IS_ERR(attr)) return PTR_ERR(attr); return uverbs_set_output(bundle, attr); } int _uverbs_get_const_signed(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, size_t idx, s64 lower_bound, u64 upper_bound, s64 *def_val) { const struct uverbs_attr *attr; attr = uverbs_attr_get(attrs_bundle, idx); if (IS_ERR(attr)) { if ((PTR_ERR(attr) != -ENOENT) || !def_val) return PTR_ERR(attr); *to = *def_val; } else { *to = attr->ptr_attr.data; } if (*to < lower_bound || (*to > 0 && (u64)*to > upper_bound)) return -EINVAL; return 0; } EXPORT_SYMBOL(_uverbs_get_const_signed); int _uverbs_get_const_unsigned(u64 *to, const struct uverbs_attr_bundle *attrs_bundle, size_t idx, u64 upper_bound, u64 *def_val) { const struct uverbs_attr *attr; attr = uverbs_attr_get(attrs_bundle, idx); if (IS_ERR(attr)) { if ((PTR_ERR(attr) != -ENOENT) || !def_val) return PTR_ERR(attr); *to = *def_val; } else { *to = attr->ptr_attr.data; } if (*to > upper_bound) return -EINVAL; return 0; } EXPORT_SYMBOL(_uverbs_get_const_unsigned); int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle, size_t idx, const void *from, size_t size) { const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx); if (IS_ERR(attr)) return PTR_ERR(attr); if (size < attr->ptr_attr.len) { if (clear_user(u64_to_user_ptr(attr->ptr_attr.data) + size, attr->ptr_attr.len - size)) return -EFAULT; } return uverbs_copy_to(bundle, idx, from, size); } EXPORT_SYMBOL(uverbs_copy_to_struct_or_zero); /* Once called an abort will call through to the type's destroy_hw() */ void uverbs_finalize_uobj_create(const struct uverbs_attr_bundle *bundle, u16 idx) { struct bundle_priv *pbundle = container_of(&bundle->hdr, struct bundle_priv, bundle); __set_bit(uapi_bkey_attr(uapi_key_attr(idx)), pbundle->uobj_hw_obj_valid); } EXPORT_SYMBOL(uverbs_finalize_uobj_create); |
1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 | // SPDX-License-Identifier: GPL-2.0+ /* * Support for dynamic clock devices * * Copyright (C) 2010 OMICRON electronics GmbH */ #include <linux/device.h> #include <linux/export.h> #include <linux/file.h> #include <linux/posix-clock.h> #include <linux/slab.h> #include <linux/syscalls.h> #include <linux/uaccess.h> #include "posix-timers.h" /* * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. */ static struct posix_clock *get_posix_clock(struct file *fp) { struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk = pccontext->clk; down_read(&clk->rwsem); if (!clk->zombie) return clk; up_read(&clk->rwsem); return NULL; } static void put_posix_clock(struct posix_clock *clk) { up_read(&clk->rwsem); } static ssize_t posix_clock_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) { struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk = get_posix_clock(fp); int err = -EINVAL; if (!clk) return -ENODEV; if (clk->ops.read) err = clk->ops.read(pccontext, fp->f_flags, buf, count); put_posix_clock(clk); return err; } static __poll_t posix_clock_poll(struct file *fp, poll_table *wait) { struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk = get_posix_clock(fp); __poll_t result = 0; if (!clk) return EPOLLERR; if (clk->ops.poll) result = clk->ops.poll(pccontext, fp, wait); put_posix_clock(clk); return result; } static long posix_clock_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk = get_posix_clock(fp); int err = -ENOTTY; if (!clk) return -ENODEV; if (clk->ops.ioctl) err = clk->ops.ioctl(pccontext, cmd, arg); put_posix_clock(clk); return err; } #ifdef CONFIG_COMPAT static long posix_clock_compat_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk = get_posix_clock(fp); int err = -ENOTTY; if (!clk) return -ENODEV; if (clk->ops.ioctl) err = clk->ops.ioctl(pccontext, cmd, arg); put_posix_clock(clk); return err; } #endif static int posix_clock_open(struct inode *inode, struct file *fp) { int err; struct posix_clock *clk = container_of(inode->i_cdev, struct posix_clock, cdev); struct posix_clock_context *pccontext; down_read(&clk->rwsem); if (clk->zombie) { err = -ENODEV; goto out; } pccontext = kzalloc(sizeof(*pccontext), GFP_KERNEL); if (!pccontext) { err = -ENOMEM; goto out; } pccontext->clk = clk; if (clk->ops.open) { err = clk->ops.open(pccontext, fp->f_mode); if (err) { kfree(pccontext); goto out; } } fp->private_data = pccontext; get_device(clk->dev); err = 0; out: up_read(&clk->rwsem); return err; } static int posix_clock_release(struct inode *inode, struct file *fp) { struct posix_clock_context *pccontext = fp->private_data; struct posix_clock *clk; int err = 0; if (!pccontext) return -ENODEV; clk = pccontext->clk; if (clk->ops.release) err = clk->ops.release(pccontext); put_device(clk->dev); kfree(pccontext); fp->private_data = NULL; return err; } static const struct file_operations posix_clock_file_operations = { .owner = THIS_MODULE, .read = posix_clock_read, .poll = posix_clock_poll, .unlocked_ioctl = posix_clock_ioctl, .open = posix_clock_open, .release = posix_clock_release, #ifdef CONFIG_COMPAT .compat_ioctl = posix_clock_compat_ioctl, #endif }; int posix_clock_register(struct posix_clock *clk, struct device *dev) { int err; init_rwsem(&clk->rwsem); cdev_init(&clk->cdev, &posix_clock_file_operations); err = cdev_device_add(&clk->cdev, dev); if (err) { pr_err("%s unable to add device %d:%d\n", dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt)); return err; } clk->cdev.owner = clk->ops.owner; clk->dev = dev; return 0; } EXPORT_SYMBOL_GPL(posix_clock_register); void posix_clock_unregister(struct posix_clock *clk) { cdev_device_del(&clk->cdev, clk->dev); down_write(&clk->rwsem); clk->zombie = true; up_write(&clk->rwsem); put_device(clk->dev); } EXPORT_SYMBOL_GPL(posix_clock_unregister); struct posix_clock_desc { struct file *fp; struct posix_clock *clk; }; static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) { struct file *fp = fget(clockid_to_fd(id)); int err = -EINVAL; if (!fp) return err; if (fp->f_op->open != posix_clock_open || !fp->private_data) goto out; cd->fp = fp; cd->clk = get_posix_clock(fp); err = cd->clk ? 0 : -ENODEV; out: if (err) fput(fp); return err; } static void put_clock_desc(struct posix_clock_desc *cd) { put_posix_clock(cd->clk); fput(cd->fp); } static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx) { struct posix_clock_desc cd; int err; err = get_clock_desc(id, &cd); if (err) return err; if ((cd.fp->f_mode & FMODE_WRITE) == 0) { err = -EACCES; goto out; } if (cd.clk->ops.clock_adjtime) err = cd.clk->ops.clock_adjtime(cd.clk, tx); else err = -EOPNOTSUPP; out: put_clock_desc(&cd); return err; } static int pc_clock_gettime(clockid_t id, struct timespec64 *ts) { struct posix_clock_desc cd; int err; err = get_clock_desc(id, &cd); if (err) return err; if (cd.clk->ops.clock_gettime) err = cd.clk->ops.clock_gettime(cd.clk, ts); else err = -EOPNOTSUPP; put_clock_desc(&cd); return err; } static int pc_clock_getres(clockid_t id, struct timespec64 *ts) { struct posix_clock_desc cd; int err; err = get_clock_desc(id, &cd); if (err) return err; if (cd.clk->ops.clock_getres) err = cd.clk->ops.clock_getres(cd.clk, ts); else err = -EOPNOTSUPP; put_clock_desc(&cd); return err; } static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) { struct posix_clock_desc cd; int err; err = get_clock_desc(id, &cd); if (err) return err; if ((cd.fp->f_mode & FMODE_WRITE) == 0) { err = -EACCES; goto out; } if (!timespec64_valid_strict(ts)) return -EINVAL; if (cd.clk->ops.clock_settime) err = cd.clk->ops.clock_settime(cd.clk, ts); else err = -EOPNOTSUPP; out: put_clock_desc(&cd); return err; } const struct k_clock clock_posix_dynamic = { .clock_getres = pc_clock_getres, .clock_set = pc_clock_settime, .clock_get_timespec = pc_clock_gettime, .clock_adj = pc_clock_adjtime, }; |
1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 | // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/syscalls.h> #include <linux/time_namespace.h> #include "futex.h" /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. * * Implementation: user-space maintains a per-thread list of locks it * is holding. Upon do_exit(), the kernel carefully walks this list, * and marks all locks that are owned by this thread with the * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is * always manipulated with the lock held, so the list is private and * per-thread. Userspace also maintains a per-thread 'list_op_pending' * field, to allow the kernel to clean up if the thread dies after * acquiring the lock, but just before it could have added itself to * the list. There can only be one such pending lock. */ /** * sys_set_robust_list() - Set the robust-futex list head of a task * @head: pointer to the list-head * @len: length of the list-head, as userspace expects */ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, size_t, len) { /* * The kernel knows only one size for now: */ if (unlikely(len != sizeof(*head))) return -EINVAL; current->robust_list = head; return 0; } /** * sys_get_robust_list() - Get the robust-futex list head of a task * @pid: pid of the process [zero for current task] * @head_ptr: pointer to a list-head pointer, the kernel fills it in * @len_ptr: pointer to a length field, the kernel fills in the header size */ SYSCALL_DEFINE3(get_robust_list, int, pid, struct robust_list_head __user * __user *, head_ptr, size_t __user *, len_ptr) { struct robust_list_head __user *head; unsigned long ret; struct task_struct *p; rcu_read_lock(); ret = -ESRCH; if (!pid) p = current; else { p = find_task_by_vpid(pid); if (!p) goto err_unlock; } ret = -EPERM; if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->robust_list; rcu_read_unlock(); if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(head, head_ptr); err_unlock: rcu_read_unlock(); return ret; } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { unsigned int flags = futex_to_flags(op); int cmd = op & FUTEX_CMD_MASK; if (flags & FLAGS_CLOCKRT) { if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI && cmd != FUTEX_LOCK_PI2) return -ENOSYS; } switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; fallthrough; case FUTEX_WAIT_BITSET: return futex_wait(uaddr, flags, val, timeout, val3); case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; fallthrough; case FUTEX_WAKE_BITSET: return futex_wake(uaddr, flags, val, val3); case FUTEX_REQUEUE: return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, NULL, 0); case FUTEX_CMP_REQUEUE: return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 0); case FUTEX_WAKE_OP: return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); case FUTEX_LOCK_PI: flags |= FLAGS_CLOCKRT; fallthrough; case FUTEX_LOCK_PI2: return futex_lock_pi(uaddr, flags, timeout, 0); case FUTEX_UNLOCK_PI: return futex_unlock_pi(uaddr, flags); case FUTEX_TRYLOCK_PI: return futex_lock_pi(uaddr, flags, NULL, 1); case FUTEX_WAIT_REQUEUE_PI: val3 = FUTEX_BITSET_MATCH_ANY; return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, uaddr2); case FUTEX_CMP_REQUEUE_PI: return futex_requeue(uaddr, flags, uaddr2, flags, val, val2, &val3, 1); } return -ENOSYS; } static __always_inline bool futex_cmd_has_timeout(u32 cmd) { switch (cmd) { case FUTEX_WAIT: case FUTEX_LOCK_PI: case FUTEX_LOCK_PI2: case FUTEX_WAIT_BITSET: case FUTEX_WAIT_REQUEUE_PI: return true; } return false; } static __always_inline int futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) { if (!timespec64_valid(ts)) return -EINVAL; *t = timespec64_to_ktime(*ts); if (cmd == FUTEX_WAIT) *t = ktime_add_safe(ktime_get(), *t); else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); return 0; } SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, const struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, u32, val3) { int ret, cmd = op & FUTEX_CMD_MASK; ktime_t t, *tp = NULL; struct timespec64 ts; if (utime && futex_cmd_has_timeout(cmd)) { if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) return -EFAULT; if (get_timespec64(&ts, utime)) return -EFAULT; ret = futex_init_timeout(cmd, op, &ts, &t); if (ret) return ret; tp = &t; } return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } /** * futex_parse_waitv - Parse a waitv array from userspace * @futexv: Kernel side list of waiters to be filled * @uwaitv: Userspace list to be parsed * @nr_futexes: Length of futexv * @wake: Wake to call when futex is woken * @wake_data: Data for the wake handler * * Return: Error code on failure, 0 on success */ int futex_parse_waitv(struct futex_vector *futexv, struct futex_waitv __user *uwaitv, unsigned int nr_futexes, futex_wake_fn *wake, void *wake_data) { struct futex_waitv aux; unsigned int i; for (i = 0; i < nr_futexes; i++) { unsigned int flags; if (copy_from_user(&aux, &uwaitv[i], sizeof(aux))) return -EFAULT; if ((aux.flags & ~FUTEX2_VALID_MASK) || aux.__reserved) return -EINVAL; flags = futex2_to_flags(aux.flags); if (!futex_flags_valid(flags)) return -EINVAL; if (!futex_validate_input(flags, aux.val)) return -EINVAL; futexv[i].w.flags = flags; futexv[i].w.val = aux.val; futexv[i].w.uaddr = aux.uaddr; futexv[i].q = futex_q_init; futexv[i].q.wake = wake; futexv[i].q.wake_data = wake_data; } return 0; } static int futex2_setup_timeout(struct __kernel_timespec __user *timeout, clockid_t clockid, struct hrtimer_sleeper *to) { int flag_clkid = 0, flag_init = 0; struct timespec64 ts; ktime_t time; int ret; if (!timeout) return 0; if (clockid == CLOCK_REALTIME) { flag_clkid = FLAGS_CLOCKRT; flag_init = FUTEX_CLOCK_REALTIME; } if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) return -EINVAL; if (get_timespec64(&ts, timeout)) return -EFAULT; /* * Since there's no opcode for futex_waitv, use * FUTEX_WAIT_BITSET that uses absolute timeout as well */ ret = futex_init_timeout(FUTEX_WAIT_BITSET, flag_init, &ts, &time); if (ret) return ret; futex_setup_timer(&time, to, flag_clkid, 0); return 0; } static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to) { hrtimer_cancel(&to->timer); destroy_hrtimer_on_stack(&to->timer); } /** * sys_futex_waitv - Wait on a list of futexes * @waiters: List of futexes to wait on * @nr_futexes: Length of futexv * @flags: Flag for timeout (monotonic/realtime) * @timeout: Optional absolute timeout. * @clockid: Clock to be used for the timeout, realtime or monotonic. * * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes * if a futex_wake() is performed at any uaddr. The syscall returns immediately * if any waiter has *uaddr != val. *timeout is an optional timeout value for * the operation. Each waiter has individual flags. The `flags` argument for * the syscall should be used solely for specifying the timeout as realtime, if * needed. Flags for private futexes, sizes, etc. should be used on the * individual flags of each waiter. * * Returns the array index of one of the woken futexes. No further information * is provided: any number of other futexes may also have been woken by the * same event, and if more than one futex was woken, the retrned index may * refer to any one of them. (It is not necessaryily the futex with the * smallest index, nor the one most recently woken, nor...) */ SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters, unsigned int, nr_futexes, unsigned int, flags, struct __kernel_timespec __user *, timeout, clockid_t, clockid) { struct hrtimer_sleeper to; struct futex_vector *futexv; int ret; /* This syscall supports no flags for now */ if (flags) return -EINVAL; if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters) return -EINVAL; if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to))) return ret; futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL); if (!futexv) { ret = -ENOMEM; goto destroy_timer; } ret = futex_parse_waitv(futexv, waiters, nr_futexes, futex_wake_mark, NULL); if (!ret) ret = futex_wait_multiple(futexv, nr_futexes, timeout ? &to : NULL); kfree(futexv); destroy_timer: if (timeout) futex2_destroy_timeout(&to); return ret; } /* * sys_futex_wake - Wake a number of futexes * @uaddr: Address of the futex(es) to wake * @mask: bitmask * @nr: Number of the futexes to wake * @flags: FUTEX2 flags * * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the * futex2 family of calls. */ SYSCALL_DEFINE4(futex_wake, void __user *, uaddr, unsigned long, mask, int, nr, unsigned int, flags) { if (flags & ~FUTEX2_VALID_MASK) return -EINVAL; flags = futex2_to_flags(flags); if (!futex_flags_valid(flags)) return -EINVAL; if (!futex_validate_input(flags, mask)) return -EINVAL; return futex_wake(uaddr, FLAGS_STRICT | flags, nr, mask); } /* * sys_futex_wait - Wait on a futex * @uaddr: Address of the futex to wait on * @val: Value of @uaddr * @mask: bitmask * @flags: FUTEX2 flags * @timeout: Optional absolute timeout * @clockid: Clock to be used for the timeout, realtime or monotonic * * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the * futex2 familiy of calls. */ SYSCALL_DEFINE6(futex_wait, void __user *, uaddr, unsigned long, val, unsigned long, mask, unsigned int, flags, struct __kernel_timespec __user *, timeout, clockid_t, clockid) { struct hrtimer_sleeper to; int ret; if (flags & ~FUTEX2_VALID_MASK) return -EINVAL; flags = futex2_to_flags(flags); if (!futex_flags_valid(flags)) return -EINVAL; if (!futex_validate_input(flags, val) || !futex_validate_input(flags, mask)) return -EINVAL; if (timeout && (ret = futex2_setup_timeout(timeout, clockid, &to))) return ret; ret = __futex_wait(uaddr, flags, val, timeout ? &to : NULL, mask); if (timeout) futex2_destroy_timeout(&to); return ret; } /* * sys_futex_requeue - Requeue a waiter from one futex to another * @waiters: array describing the source and destination futex * @flags: unused * @nr_wake: number of futexes to wake * @nr_requeue: number of futexes to requeue * * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the * futex2 family of calls. */ SYSCALL_DEFINE4(futex_requeue, struct futex_waitv __user *, waiters, unsigned int, flags, int, nr_wake, int, nr_requeue) { struct futex_vector futexes[2]; u32 cmpval; int ret; if (flags) return -EINVAL; if (!waiters) return -EINVAL; ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL); if (ret) return ret; cmpval = futexes[0].w.val; return futex_requeue(u64_to_user_ptr(futexes[0].w.uaddr), futexes[0].w.flags, u64_to_user_ptr(futexes[1].w.uaddr), futexes[1].w.flags, nr_wake, nr_requeue, &cmpval, 0); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(set_robust_list, struct compat_robust_list_head __user *, head, compat_size_t, len) { if (unlikely(len != sizeof(*head))) return -EINVAL; current->compat_robust_list = head; return 0; } COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, compat_uptr_t __user *, head_ptr, compat_size_t __user *, len_ptr) { struct compat_robust_list_head __user *head; unsigned long ret; struct task_struct *p; rcu_read_lock(); ret = -ESRCH; if (!pid) p = current; else { p = find_task_by_vpid(pid); if (!p) goto err_unlock; } ret = -EPERM; if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->compat_robust_list; rcu_read_unlock(); if (put_user(sizeof(*head), len_ptr)) return -EFAULT; return put_user(ptr_to_compat(head), head_ptr); err_unlock: rcu_read_unlock(); return ret; } #endif /* CONFIG_COMPAT */ #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { int ret, cmd = op & FUTEX_CMD_MASK; ktime_t t, *tp = NULL; struct timespec64 ts; if (utime && futex_cmd_has_timeout(cmd)) { if (get_old_timespec32(&ts, utime)) return -EFAULT; ret = futex_init_timeout(cmd, op, &ts, &t); if (ret) return ret; tp = &t; } return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #endif /* CONFIG_COMPAT_32BIT_TIME */ |
10 10 2 2 21 21 3 19 1 1 1 1 2 2 2 2 2 6 1 1 1 1 1 2 3 848 840 10 7 3 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007, 2008, 2009 Siemens AG */ #include <linux/slab.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/device.h> #include <net/cfg802154.h> #include <net/rtnetlink.h> #include "ieee802154.h" #include "nl802154.h" #include "sysfs.h" #include "core.h" /* name for sysfs, %d is appended */ #define PHY_NAME "phy" /* RCU-protected (and RTNL for writers) */ LIST_HEAD(cfg802154_rdev_list); int cfg802154_rdev_list_generation; struct wpan_phy *wpan_phy_find(const char *str) { struct device *dev; if (WARN_ON(!str)) return NULL; dev = class_find_device_by_name(&wpan_phy_class, str); if (!dev) return NULL; return container_of(dev, struct wpan_phy, dev); } EXPORT_SYMBOL(wpan_phy_find); struct wpan_phy_iter_data { int (*fn)(struct wpan_phy *phy, void *data); void *data; }; static int wpan_phy_iter(struct device *dev, void *_data) { struct wpan_phy_iter_data *wpid = _data; struct wpan_phy *phy = container_of(dev, struct wpan_phy, dev); return wpid->fn(phy, wpid->data); } int wpan_phy_for_each(int (*fn)(struct wpan_phy *phy, void *data), void *data) { struct wpan_phy_iter_data wpid = { .fn = fn, .data = data, }; return class_for_each_device(&wpan_phy_class, NULL, &wpid, wpan_phy_iter); } EXPORT_SYMBOL(wpan_phy_for_each); struct cfg802154_registered_device * cfg802154_rdev_by_wpan_phy_idx(int wpan_phy_idx) { struct cfg802154_registered_device *result = NULL, *rdev; ASSERT_RTNL(); list_for_each_entry(rdev, &cfg802154_rdev_list, list) { if (rdev->wpan_phy_idx == wpan_phy_idx) { result = rdev; break; } } return result; } struct wpan_phy *wpan_phy_idx_to_wpan_phy(int wpan_phy_idx) { struct cfg802154_registered_device *rdev; ASSERT_RTNL(); rdev = cfg802154_rdev_by_wpan_phy_idx(wpan_phy_idx); if (!rdev) return NULL; return &rdev->wpan_phy; } struct wpan_phy * wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size) { static atomic_t wpan_phy_counter = ATOMIC_INIT(0); struct cfg802154_registered_device *rdev; size_t alloc_size; alloc_size = sizeof(*rdev) + priv_size; rdev = kzalloc(alloc_size, GFP_KERNEL); if (!rdev) return NULL; rdev->ops = ops; rdev->wpan_phy_idx = atomic_inc_return(&wpan_phy_counter); if (unlikely(rdev->wpan_phy_idx < 0)) { /* ugh, wrapped! */ atomic_dec(&wpan_phy_counter); kfree(rdev); return NULL; } /* atomic_inc_return makes it start at 1, make it start at 0 */ rdev->wpan_phy_idx--; INIT_LIST_HEAD(&rdev->wpan_dev_list); device_initialize(&rdev->wpan_phy.dev); dev_set_name(&rdev->wpan_phy.dev, PHY_NAME "%d", rdev->wpan_phy_idx); rdev->wpan_phy.dev.class = &wpan_phy_class; rdev->wpan_phy.dev.platform_data = rdev; wpan_phy_net_set(&rdev->wpan_phy, &init_net); init_waitqueue_head(&rdev->dev_wait); init_waitqueue_head(&rdev->wpan_phy.sync_txq); spin_lock_init(&rdev->wpan_phy.queue_lock); return &rdev->wpan_phy; } EXPORT_SYMBOL(wpan_phy_new); int wpan_phy_register(struct wpan_phy *phy) { struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(phy); int ret; rtnl_lock(); ret = device_add(&phy->dev); if (ret) { rtnl_unlock(); return ret; } list_add_rcu(&rdev->list, &cfg802154_rdev_list); cfg802154_rdev_list_generation++; /* TODO phy registered lock */ rtnl_unlock(); /* TODO nl802154 phy notify */ return 0; } EXPORT_SYMBOL(wpan_phy_register); void wpan_phy_unregister(struct wpan_phy *phy) { struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(phy); wait_event(rdev->dev_wait, ({ int __count; rtnl_lock(); __count = rdev->opencount; rtnl_unlock(); __count == 0; })); rtnl_lock(); /* TODO nl802154 phy notify */ /* TODO phy registered lock */ WARN_ON(!list_empty(&rdev->wpan_dev_list)); /* First remove the hardware from everywhere, this makes * it impossible to find from userspace. */ list_del_rcu(&rdev->list); synchronize_rcu(); cfg802154_rdev_list_generation++; device_del(&phy->dev); rtnl_unlock(); } EXPORT_SYMBOL(wpan_phy_unregister); void wpan_phy_free(struct wpan_phy *phy) { put_device(&phy->dev); } EXPORT_SYMBOL(wpan_phy_free); static void cfg802154_free_peer_structures(struct wpan_dev *wpan_dev) { struct ieee802154_pan_device *child, *tmp; mutex_lock(&wpan_dev->association_lock); kfree(wpan_dev->parent); wpan_dev->parent = NULL; list_for_each_entry_safe(child, tmp, &wpan_dev->children, node) { list_del(&child->node); kfree(child); } wpan_dev->nchildren = 0; mutex_unlock(&wpan_dev->association_lock); } int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, struct net *net) { struct wpan_dev *wpan_dev; int err = 0; list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) { if (!wpan_dev->netdev) continue; wpan_dev->netdev->netns_local = false; err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); if (err) break; wpan_dev->netdev->netns_local = true; } if (err) { /* failed -- clean up to old netns */ net = wpan_phy_net(&rdev->wpan_phy); list_for_each_entry_continue_reverse(wpan_dev, &rdev->wpan_dev_list, list) { if (!wpan_dev->netdev) continue; wpan_dev->netdev->netns_local = false; err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); WARN_ON(err); wpan_dev->netdev->netns_local = true; } return err; } wpan_phy_net_set(&rdev->wpan_phy, net); err = device_rename(&rdev->wpan_phy.dev, dev_name(&rdev->wpan_phy.dev)); WARN_ON(err); return 0; } void cfg802154_dev_free(struct cfg802154_registered_device *rdev) { kfree(rdev); } static void cfg802154_update_iface_num(struct cfg802154_registered_device *rdev, int iftype, int num) { ASSERT_RTNL(); rdev->num_running_ifaces += num; } static int cfg802154_netdev_notifier_call(struct notifier_block *nb, unsigned long state, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct cfg802154_registered_device *rdev; if (!wpan_dev) return NOTIFY_DONE; rdev = wpan_phy_to_rdev(wpan_dev->wpan_phy); /* TODO WARN_ON unspec type */ switch (state) { /* TODO NETDEV_DEVTYPE */ case NETDEV_REGISTER: dev->netns_local = true; wpan_dev->identifier = ++rdev->wpan_dev_id; list_add_rcu(&wpan_dev->list, &rdev->wpan_dev_list); rdev->devlist_generation++; mutex_init(&wpan_dev->association_lock); INIT_LIST_HEAD(&wpan_dev->children); wpan_dev->max_associations = SZ_16K; wpan_dev->netdev = dev; break; case NETDEV_DOWN: cfg802154_update_iface_num(rdev, wpan_dev->iftype, -1); rdev->opencount--; wake_up(&rdev->dev_wait); break; case NETDEV_UP: cfg802154_update_iface_num(rdev, wpan_dev->iftype, 1); rdev->opencount++; break; case NETDEV_UNREGISTER: cfg802154_free_peer_structures(wpan_dev); /* It is possible to get NETDEV_UNREGISTER * multiple times. To detect that, check * that the interface is still on the list * of registered interfaces, and only then * remove and clean it up. */ if (!list_empty(&wpan_dev->list)) { list_del_rcu(&wpan_dev->list); rdev->devlist_generation++; } /* synchronize (so that we won't find this netdev * from other code any more) and then clear the list * head so that the above code can safely check for * !list_empty() to avoid double-cleanup. */ synchronize_rcu(); INIT_LIST_HEAD(&wpan_dev->list); break; default: return NOTIFY_DONE; } return NOTIFY_OK; } static struct notifier_block cfg802154_netdev_notifier = { .notifier_call = cfg802154_netdev_notifier_call, }; static void __net_exit cfg802154_pernet_exit(struct net *net) { struct cfg802154_registered_device *rdev; rtnl_lock(); list_for_each_entry(rdev, &cfg802154_rdev_list, list) { if (net_eq(wpan_phy_net(&rdev->wpan_phy), net)) WARN_ON(cfg802154_switch_netns(rdev, &init_net)); } rtnl_unlock(); } static struct pernet_operations cfg802154_pernet_ops = { .exit = cfg802154_pernet_exit, }; static int __init wpan_phy_class_init(void) { int rc; rc = register_pernet_device(&cfg802154_pernet_ops); if (rc) goto err; rc = wpan_phy_sysfs_init(); if (rc) goto err_sysfs; rc = register_netdevice_notifier(&cfg802154_netdev_notifier); if (rc) goto err_nl; rc = ieee802154_nl_init(); if (rc) goto err_notifier; rc = nl802154_init(); if (rc) goto err_ieee802154_nl; return 0; err_ieee802154_nl: ieee802154_nl_exit(); err_notifier: unregister_netdevice_notifier(&cfg802154_netdev_notifier); err_nl: wpan_phy_sysfs_exit(); err_sysfs: unregister_pernet_device(&cfg802154_pernet_ops); err: return rc; } subsys_initcall(wpan_phy_class_init); static void __exit wpan_phy_class_exit(void) { nl802154_exit(); ieee802154_nl_exit(); unregister_netdevice_notifier(&cfg802154_netdev_notifier); wpan_phy_sysfs_exit(); unregister_pernet_device(&cfg802154_pernet_ops); } module_exit(wpan_phy_class_exit); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("IEEE 802.15.4 configuration interface"); MODULE_AUTHOR("Dmitry Eremin-Solenikov"); |
4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright 2019 Google LLC */ #ifndef __LINUX_BLK_CRYPTO_INTERNAL_H #define __LINUX_BLK_CRYPTO_INTERNAL_H #include <linux/bio.h> #include <linux/blk-mq.h> /* Represents a crypto mode supported by blk-crypto */ struct blk_crypto_mode { const char *name; /* name of this mode, shown in sysfs */ const char *cipher_str; /* crypto API name (for fallback case) */ unsigned int keysize; /* key size in bytes */ unsigned int ivsize; /* iv size in bytes */ }; extern const struct blk_crypto_mode blk_crypto_modes[]; #ifdef CONFIG_BLK_INLINE_ENCRYPTION int blk_crypto_sysfs_register(struct gendisk *disk); void blk_crypto_sysfs_unregister(struct gendisk *disk); void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], unsigned int inc); bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio); bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes, struct bio_crypt_ctx *bc2); static inline bool bio_crypt_ctx_back_mergeable(struct request *req, struct bio *bio) { return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req), bio->bi_crypt_context); } static inline bool bio_crypt_ctx_front_mergeable(struct request *req, struct bio *bio) { return bio_crypt_ctx_mergeable(bio->bi_crypt_context, bio->bi_iter.bi_size, req->crypt_ctx); } static inline bool bio_crypt_ctx_merge_rq(struct request *req, struct request *next) { return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req), next->crypt_ctx); } static inline void blk_crypto_rq_set_defaults(struct request *rq) { rq->crypt_ctx = NULL; rq->crypt_keyslot = NULL; } static inline bool blk_crypto_rq_is_encrypted(struct request *rq) { return rq->crypt_ctx; } static inline bool blk_crypto_rq_has_keyslot(struct request *rq) { return rq->crypt_keyslot; } blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, struct blk_crypto_keyslot **slot_ptr); void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot); int __blk_crypto_evict_key(struct blk_crypto_profile *profile, const struct blk_crypto_key *key); bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, const struct blk_crypto_config *cfg); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline int blk_crypto_sysfs_register(struct gendisk *disk) { return 0; } static inline void blk_crypto_sysfs_unregister(struct gendisk *disk) { } static inline bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) { return true; } static inline bool bio_crypt_ctx_front_mergeable(struct request *req, struct bio *bio) { return true; } static inline bool bio_crypt_ctx_back_mergeable(struct request *req, struct bio *bio) { return true; } static inline bool bio_crypt_ctx_merge_rq(struct request *req, struct request *next) { return true; } static inline void blk_crypto_rq_set_defaults(struct request *rq) { } static inline bool blk_crypto_rq_is_encrypted(struct request *rq) { return false; } static inline bool blk_crypto_rq_has_keyslot(struct request *rq) { return false; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ void __bio_crypt_advance(struct bio *bio, unsigned int bytes); static inline void bio_crypt_advance(struct bio *bio, unsigned int bytes) { if (bio_has_crypt_ctx(bio)) __bio_crypt_advance(bio, bytes); } void __bio_crypt_free_ctx(struct bio *bio); static inline void bio_crypt_free_ctx(struct bio *bio) { if (bio_has_crypt_ctx(bio)) __bio_crypt_free_ctx(bio); } static inline void bio_crypt_do_front_merge(struct request *rq, struct bio *bio) { #ifdef CONFIG_BLK_INLINE_ENCRYPTION if (bio_has_crypt_ctx(bio)) memcpy(rq->crypt_ctx->bc_dun, bio->bi_crypt_context->bc_dun, sizeof(rq->crypt_ctx->bc_dun)); #endif } bool __blk_crypto_bio_prep(struct bio **bio_ptr); static inline bool blk_crypto_bio_prep(struct bio **bio_ptr) { if (bio_has_crypt_ctx(*bio_ptr)) return __blk_crypto_bio_prep(bio_ptr); return true; } blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq); static inline blk_status_t blk_crypto_rq_get_keyslot(struct request *rq) { if (blk_crypto_rq_is_encrypted(rq)) return __blk_crypto_rq_get_keyslot(rq); return BLK_STS_OK; } void __blk_crypto_rq_put_keyslot(struct request *rq); static inline void blk_crypto_rq_put_keyslot(struct request *rq) { if (blk_crypto_rq_has_keyslot(rq)) __blk_crypto_rq_put_keyslot(rq); } void __blk_crypto_free_request(struct request *rq); static inline void blk_crypto_free_request(struct request *rq) { if (blk_crypto_rq_is_encrypted(rq)) __blk_crypto_free_request(rq); } int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask); /** * blk_crypto_rq_bio_prep - Prepare a request's crypt_ctx when its first bio * is inserted * @rq: The request to prepare * @bio: The first bio being inserted into the request * @gfp_mask: Memory allocation flags * * Return: 0 on success, -ENOMEM if out of memory. -ENOMEM is only possible if * @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM. */ static inline int blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask) { if (bio_has_crypt_ctx(bio)) return __blk_crypto_rq_bio_prep(rq, bio, gfp_mask); return 0; } #ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num); bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr); int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key); #else /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ static inline int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) { pr_warn_once("crypto API fallback is disabled\n"); return -ENOPKG; } static inline bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) { pr_warn_once("crypto API fallback disabled; failing request.\n"); (*bio_ptr)->bi_status = BLK_STS_NOTSUPP; return false; } static inline int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) { return 0; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ #endif /* __LINUX_BLK_CRYPTO_INTERNAL_H */ |
4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 | // SPDX-License-Identifier: GPL-2.0 /* * CPU <-> hardware queue mapping helpers * * Copyright (C) 2013-2014 Jens Axboe */ #include <linux/kernel.h> #include <linux/threads.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/cpu.h> #include <linux/group_cpus.h> #include "blk.h" #include "blk-mq.h" void blk_mq_map_queues(struct blk_mq_queue_map *qmap) { const struct cpumask *masks; unsigned int queue, cpu; masks = group_cpus_evenly(qmap->nr_queues); if (!masks) { for_each_possible_cpu(cpu) qmap->mq_map[cpu] = qmap->queue_offset; return; } for (queue = 0; queue < qmap->nr_queues; queue++) { for_each_cpu(cpu, &masks[queue]) qmap->mq_map[cpu] = qmap->queue_offset + queue; } kfree(masks); } EXPORT_SYMBOL_GPL(blk_mq_map_queues); /** * blk_mq_hw_queue_to_node - Look up the memory node for a hardware queue index * @qmap: CPU to hardware queue map. * @index: hardware queue index. * * We have no quick way of doing reverse lookups. This is only used at * queue init time, so runtime isn't important. */ int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index) { int i; for_each_possible_cpu(i) { if (index == qmap->mq_map[i]) return cpu_to_node(i); } return NUMA_NO_NODE; } |
1 1 1 9 10 4 6 5 4 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 | // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2019 Mellanox Technologies. All rights reserved. */ #include <rdma/ib_verbs.h> #include <rdma/rdma_counter.h> #include "core_priv.h" #include "restrack.h" #define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE | RDMA_COUNTER_MASK_PID) static int __counter_set_mode(struct rdma_port_counter *port_counter, enum rdma_nl_counter_mode new_mode, enum rdma_nl_counter_mask new_mask) { if (new_mode == RDMA_COUNTER_MODE_AUTO) { if (new_mask & (~ALL_AUTO_MODE_MASKS)) return -EINVAL; if (port_counter->num_counters) return -EBUSY; } port_counter->mode.mode = new_mode; port_counter->mode.mask = new_mask; return 0; } /* * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode * * @dev: Device to operate * @port: Port to use * @mask: Mask to configure * @extack: Message to the user * * Return 0 on success. If counter mode wasn't changed then it is considered * as success as well. * Return -EBUSY when changing to auto mode while there are bounded counters. * */ int rdma_counter_set_auto_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mask mask, struct netlink_ext_ack *extack) { struct rdma_port_counter *port_counter; enum rdma_nl_counter_mode mode; int ret; port_counter = &dev->port_data[port].port_counter; if (!port_counter->hstats) return -EOPNOTSUPP; mutex_lock(&port_counter->lock); if (mask) mode = RDMA_COUNTER_MODE_AUTO; else mode = (port_counter->num_counters) ? RDMA_COUNTER_MODE_MANUAL : RDMA_COUNTER_MODE_NONE; if (port_counter->mode.mode == mode && port_counter->mode.mask == mask) { ret = 0; goto out; } ret = __counter_set_mode(port_counter, mode, mask); out: mutex_unlock(&port_counter->lock); if (ret == -EBUSY) NL_SET_ERR_MSG( extack, "Modifying auto mode is not allowed when there is a bound QP"); return ret; } static void auto_mode_init_counter(struct rdma_counter *counter, const struct ib_qp *qp, enum rdma_nl_counter_mask new_mask) { struct auto_mode_param *param = &counter->mode.param; counter->mode.mode = RDMA_COUNTER_MODE_AUTO; counter->mode.mask = new_mask; if (new_mask & RDMA_COUNTER_MASK_QP_TYPE) param->qp_type = qp->qp_type; } static int __rdma_counter_bind_qp(struct rdma_counter *counter, struct ib_qp *qp) { int ret; if (qp->counter) return -EINVAL; if (!qp->device->ops.counter_bind_qp) return -EOPNOTSUPP; mutex_lock(&counter->lock); ret = qp->device->ops.counter_bind_qp(counter, qp); mutex_unlock(&counter->lock); return ret; } int rdma_counter_modify(struct ib_device *dev, u32 port, unsigned int index, bool enable) { struct rdma_hw_stats *stats; int ret = 0; if (!dev->ops.modify_hw_stat) return -EOPNOTSUPP; stats = ib_get_hw_stats_port(dev, port); if (!stats || index >= stats->num_counters || !(stats->descs[index].flags & IB_STAT_FLAG_OPTIONAL)) return -EINVAL; mutex_lock(&stats->lock); if (enable != test_bit(index, stats->is_disabled)) goto out; ret = dev->ops.modify_hw_stat(dev, port, index, enable); if (ret) goto out; if (enable) clear_bit(index, stats->is_disabled); else set_bit(index, stats->is_disabled); out: mutex_unlock(&stats->lock); return ret; } static struct rdma_counter *alloc_and_bind(struct ib_device *dev, u32 port, struct ib_qp *qp, enum rdma_nl_counter_mode mode) { struct rdma_port_counter *port_counter; struct rdma_counter *counter; int ret; if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats) return NULL; counter = kzalloc(sizeof(*counter), GFP_KERNEL); if (!counter) return NULL; counter->device = dev; counter->port = port; rdma_restrack_new(&counter->res, RDMA_RESTRACK_COUNTER); counter->stats = dev->ops.counter_alloc_stats(counter); if (!counter->stats) goto err_stats; port_counter = &dev->port_data[port].port_counter; mutex_lock(&port_counter->lock); switch (mode) { case RDMA_COUNTER_MODE_MANUAL: ret = __counter_set_mode(port_counter, RDMA_COUNTER_MODE_MANUAL, 0); if (ret) { mutex_unlock(&port_counter->lock); goto err_mode; } break; case RDMA_COUNTER_MODE_AUTO: auto_mode_init_counter(counter, qp, port_counter->mode.mask); break; default: ret = -EOPNOTSUPP; mutex_unlock(&port_counter->lock); goto err_mode; } port_counter->num_counters++; mutex_unlock(&port_counter->lock); counter->mode.mode = mode; kref_init(&counter->kref); mutex_init(&counter->lock); ret = __rdma_counter_bind_qp(counter, qp); if (ret) goto err_mode; rdma_restrack_parent_name(&counter->res, &qp->res); rdma_restrack_add(&counter->res); return counter; err_mode: rdma_free_hw_stats_struct(counter->stats); err_stats: rdma_restrack_put(&counter->res); kfree(counter); return NULL; } static void rdma_counter_free(struct rdma_counter *counter) { struct rdma_port_counter *port_counter; port_counter = &counter->device->port_data[counter->port].port_counter; mutex_lock(&port_counter->lock); port_counter->num_counters--; if (!port_counter->num_counters && (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL)) __counter_set_mode(port_counter, RDMA_COUNTER_MODE_NONE, 0); mutex_unlock(&port_counter->lock); rdma_restrack_del(&counter->res); rdma_free_hw_stats_struct(counter->stats); kfree(counter); } static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter, enum rdma_nl_counter_mask auto_mask) { struct auto_mode_param *param = &counter->mode.param; bool match = true; if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE) match &= (param->qp_type == qp->qp_type); if (auto_mask & RDMA_COUNTER_MASK_PID) match &= (task_pid_nr(counter->res.task) == task_pid_nr(qp->res.task)); return match; } static int __rdma_counter_unbind_qp(struct ib_qp *qp) { struct rdma_counter *counter = qp->counter; int ret; if (!qp->device->ops.counter_unbind_qp) return -EOPNOTSUPP; mutex_lock(&counter->lock); ret = qp->device->ops.counter_unbind_qp(qp); mutex_unlock(&counter->lock); return ret; } static void counter_history_stat_update(struct rdma_counter *counter) { struct ib_device *dev = counter->device; struct rdma_port_counter *port_counter; int i; port_counter = &dev->port_data[counter->port].port_counter; if (!port_counter->hstats) return; rdma_counter_query_stats(counter); for (i = 0; i < counter->stats->num_counters; i++) port_counter->hstats->value[i] += counter->stats->value[i]; } /* * rdma_get_counter_auto_mode - Find the counter that @qp should be bound * with in auto mode * * Return: The counter (with ref-count increased) if found */ static struct rdma_counter *rdma_get_counter_auto_mode(struct ib_qp *qp, u32 port) { struct rdma_port_counter *port_counter; struct rdma_counter *counter = NULL; struct ib_device *dev = qp->device; struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; unsigned long id = 0; port_counter = &dev->port_data[port].port_counter; rt = &dev->res[RDMA_RESTRACK_COUNTER]; xa_lock(&rt->xa); xa_for_each(&rt->xa, id, res) { counter = container_of(res, struct rdma_counter, res); if ((counter->device != qp->device) || (counter->port != port)) goto next; if (auto_mode_match(qp, counter, port_counter->mode.mask)) break; next: counter = NULL; } if (counter && !kref_get_unless_zero(&counter->kref)) counter = NULL; xa_unlock(&rt->xa); return counter; } static void counter_release(struct kref *kref) { struct rdma_counter *counter; counter = container_of(kref, struct rdma_counter, kref); counter_history_stat_update(counter); counter->device->ops.counter_dealloc(counter); rdma_counter_free(counter); } /* * rdma_counter_bind_qp_auto - Check and bind the QP to a counter base on * the auto-mode rule */ int rdma_counter_bind_qp_auto(struct ib_qp *qp, u32 port) { struct rdma_port_counter *port_counter; struct ib_device *dev = qp->device; struct rdma_counter *counter; int ret; if (!rdma_restrack_is_tracked(&qp->res) || rdma_is_kernel_res(&qp->res)) return 0; if (!rdma_is_port_valid(dev, port)) return -EINVAL; port_counter = &dev->port_data[port].port_counter; if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) return 0; counter = rdma_get_counter_auto_mode(qp, port); if (counter) { ret = __rdma_counter_bind_qp(counter, qp); if (ret) { kref_put(&counter->kref, counter_release); return ret; } } else { counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_AUTO); if (!counter) return -ENOMEM; } return 0; } /* * rdma_counter_unbind_qp - Unbind a qp from a counter * @force: * true - Decrease the counter ref-count anyway (e.g., qp destroy) */ int rdma_counter_unbind_qp(struct ib_qp *qp, bool force) { struct rdma_counter *counter = qp->counter; int ret; if (!counter) return -EINVAL; ret = __rdma_counter_unbind_qp(qp); if (ret && !force) return ret; kref_put(&counter->kref, counter_release); return 0; } int rdma_counter_query_stats(struct rdma_counter *counter) { struct ib_device *dev = counter->device; int ret; if (!dev->ops.counter_update_stats) return -EINVAL; mutex_lock(&counter->lock); ret = dev->ops.counter_update_stats(counter); mutex_unlock(&counter->lock); return ret; } static u64 get_running_counters_hwstat_sum(struct ib_device *dev, u32 port, u32 index) { struct rdma_restrack_entry *res; struct rdma_restrack_root *rt; struct rdma_counter *counter; unsigned long id = 0; u64 sum = 0; rt = &dev->res[RDMA_RESTRACK_COUNTER]; xa_lock(&rt->xa); xa_for_each(&rt->xa, id, res) { if (!rdma_restrack_get(res)) continue; xa_unlock(&rt->xa); counter = container_of(res, struct rdma_counter, res); if ((counter->device != dev) || (counter->port != port) || rdma_counter_query_stats(counter)) goto next; sum += counter->stats->value[index]; next: xa_lock(&rt->xa); rdma_restrack_put(res); } xa_unlock(&rt->xa); return sum; } /* * rdma_counter_get_hwstat_value() - Get the sum value of all counters on a * specific port, including the running ones and history data */ u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u32 port, u32 index) { struct rdma_port_counter *port_counter; u64 sum; port_counter = &dev->port_data[port].port_counter; if (!port_counter->hstats) return 0; sum = get_running_counters_hwstat_sum(dev, port, index); sum += port_counter->hstats->value[index]; return sum; } static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num) { struct rdma_restrack_entry *res = NULL; struct ib_qp *qp = NULL; res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_QP, qp_num); if (IS_ERR(res)) return NULL; qp = container_of(res, struct ib_qp, res); if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) goto err; return qp; err: rdma_restrack_put(res); return NULL; } static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev, u32 counter_id) { struct rdma_restrack_entry *res; struct rdma_counter *counter; res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_COUNTER, counter_id); if (IS_ERR(res)) return NULL; counter = container_of(res, struct rdma_counter, res); kref_get(&counter->kref); rdma_restrack_put(res); return counter; } /* * rdma_counter_bind_qpn() - Bind QP @qp_num to counter @counter_id */ int rdma_counter_bind_qpn(struct ib_device *dev, u32 port, u32 qp_num, u32 counter_id) { struct rdma_port_counter *port_counter; struct rdma_counter *counter; struct ib_qp *qp; int ret; port_counter = &dev->port_data[port].port_counter; if (port_counter->mode.mode == RDMA_COUNTER_MODE_AUTO) return -EINVAL; qp = rdma_counter_get_qp(dev, qp_num); if (!qp) return -ENOENT; counter = rdma_get_counter_by_id(dev, counter_id); if (!counter) { ret = -ENOENT; goto err; } if (rdma_is_kernel_res(&counter->res) != rdma_is_kernel_res(&qp->res)) { ret = -EINVAL; goto err_task; } if ((counter->device != qp->device) || (counter->port != qp->port)) { ret = -EINVAL; goto err_task; } ret = __rdma_counter_bind_qp(counter, qp); if (ret) goto err_task; rdma_restrack_put(&qp->res); return 0; err_task: kref_put(&counter->kref, counter_release); err: rdma_restrack_put(&qp->res); return ret; } /* * rdma_counter_bind_qpn_alloc() - Alloc a counter and bind QP @qp_num to it * The id of new counter is returned in @counter_id */ int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u32 port, u32 qp_num, u32 *counter_id) { struct rdma_port_counter *port_counter; struct rdma_counter *counter; struct ib_qp *qp; int ret; if (!rdma_is_port_valid(dev, port)) return -EINVAL; port_counter = &dev->port_data[port].port_counter; if (!port_counter->hstats) return -EOPNOTSUPP; if (port_counter->mode.mode == RDMA_COUNTER_MODE_AUTO) return -EINVAL; qp = rdma_counter_get_qp(dev, qp_num); if (!qp) return -ENOENT; if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { ret = -EINVAL; goto err; } counter = alloc_and_bind(dev, port, qp, RDMA_COUNTER_MODE_MANUAL); if (!counter) { ret = -ENOMEM; goto err; } if (counter_id) *counter_id = counter->id; rdma_restrack_put(&qp->res); return 0; err: rdma_restrack_put(&qp->res); return ret; } /* * rdma_counter_unbind_qpn() - Unbind QP @qp_num from a counter */ int rdma_counter_unbind_qpn(struct ib_device *dev, u32 port, u32 qp_num, u32 counter_id) { struct rdma_port_counter *port_counter; struct ib_qp *qp; int ret; if (!rdma_is_port_valid(dev, port)) return -EINVAL; qp = rdma_counter_get_qp(dev, qp_num); if (!qp) return -ENOENT; if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { ret = -EINVAL; goto out; } port_counter = &dev->port_data[port].port_counter; if (!qp->counter || qp->counter->id != counter_id || port_counter->mode.mode != RDMA_COUNTER_MODE_MANUAL) { ret = -EINVAL; goto out; } ret = rdma_counter_unbind_qp(qp, false); out: rdma_restrack_put(&qp->res); return ret; } int rdma_counter_get_mode(struct ib_device *dev, u32 port, enum rdma_nl_counter_mode *mode, enum rdma_nl_counter_mask *mask) { struct rdma_port_counter *port_counter; port_counter = &dev->port_data[port].port_counter; *mode = port_counter->mode.mode; *mask = port_counter->mode.mask; return 0; } void rdma_counter_init(struct ib_device *dev) { struct rdma_port_counter *port_counter; u32 port, i; if (!dev->port_data) return; rdma_for_each_port(dev, port) { port_counter = &dev->port_data[port].port_counter; port_counter->mode.mode = RDMA_COUNTER_MODE_NONE; mutex_init(&port_counter->lock); if (!dev->ops.alloc_hw_port_stats) continue; port_counter->hstats = dev->ops.alloc_hw_port_stats(dev, port); if (!port_counter->hstats) goto fail; } return; fail: for (i = port; i >= rdma_start_port(dev); i--) { port_counter = &dev->port_data[port].port_counter; rdma_free_hw_stats_struct(port_counter->hstats); port_counter->hstats = NULL; mutex_destroy(&port_counter->lock); } } void rdma_counter_release(struct ib_device *dev) { struct rdma_port_counter *port_counter; u32 port; rdma_for_each_port(dev, port) { port_counter = &dev->port_data[port].port_counter; rdma_free_hw_stats_struct(port_counter->hstats); mutex_destroy(&port_counter->lock); } } |
76 100 154 181 85 81 22 27 61 74 43 30 98 249 249 157 88 51 83 87 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Red Black Trees (C) 1999 Andrea Arcangeli <andrea@suse.de> (C) 2002 David Woodhouse <dwmw2@infradead.org> (C) 2012 Michel Lespinasse <walken@google.com> linux/include/linux/rbtree_augmented.h */ #ifndef _LINUX_RBTREE_AUGMENTED_H #define _LINUX_RBTREE_AUGMENTED_H #include <linux/compiler.h> #include <linux/rbtree.h> #include <linux/rcupdate.h> /* * Please note - only struct rb_augment_callbacks and the prototypes for * rb_insert_augmented() and rb_erase_augmented() are intended to be public. * The rest are implementation details you are not expected to depend on. * * See Documentation/core-api/rbtree.rst for documentation and samples. */ struct rb_augment_callbacks { void (*propagate)(struct rb_node *node, struct rb_node *stop); void (*copy)(struct rb_node *old, struct rb_node *new); void (*rotate)(struct rb_node *old, struct rb_node *new); }; extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); /* * Fixup the rbtree and update the augmented information when rebalancing. * * On insertion, the user must update the augmented information on the path * leading to the inserted node, then call rb_link_node() as usual and * rb_insert_augmented() instead of the usual rb_insert_color() call. * If rb_insert_augmented() rebalances the rbtree, it will callback into * a user provided function to update the augmented information on the * affected subtrees. */ static inline void rb_insert_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { __rb_insert_augmented(node, root, augment->rotate); } static inline void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *root, bool newleft, const struct rb_augment_callbacks *augment) { if (newleft) root->rb_leftmost = node; rb_insert_augmented(node, &root->rb_root, augment); } static __always_inline struct rb_node * rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree, bool (*less)(struct rb_node *, const struct rb_node *), const struct rb_augment_callbacks *augment) { struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; bool leftmost = true; while (*link) { parent = *link; if (less(node, parent)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(node, parent, link); augment->propagate(parent, NULL); /* suboptimal */ rb_insert_augmented_cached(node, tree, leftmost, augment); return leftmost ? node : NULL; } /* * Template for declaring augmented rbtree callbacks (generic case) * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data */ #define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \ static inline void \ RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ if (RBCOMPUTE(node, true)) \ break; \ rb = rb_parent(&node->RBFIELD); \ } \ } \ static inline void \ RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ } \ static void \ RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ RBCOMPUTE(old, false); \ } \ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .propagate = RBNAME ## _propagate, \ .copy = RBNAME ## _copy, \ .rotate = RBNAME ## _rotate \ }; /* * Template for declaring augmented rbtree callbacks, * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes. * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT * RBTYPE: type of the RBAUGMENTED field * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar */ #define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ RBTYPE, RBAUGMENTED, RBCOMPUTE) \ static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \ { \ RBSTRUCT *child; \ RBTYPE max = RBCOMPUTE(node); \ if (node->RBFIELD.rb_left) { \ child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ if (node->RBFIELD.rb_right) { \ child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ if (exit && node->RBAUGMENTED == max) \ return true; \ node->RBAUGMENTED = max; \ return false; \ } \ RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max) #define RB_RED 0 #define RB_BLACK 1 #define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) #define __rb_color(pc) ((pc) & 1) #define __rb_is_black(pc) __rb_color(pc) #define __rb_is_red(pc) (!__rb_color(pc)) #define rb_color(rb) __rb_color((rb)->__rb_parent_color) #define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) #define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { rb->__rb_parent_color = rb_color(rb) + (unsigned long)p; } static inline void rb_set_parent_color(struct rb_node *rb, struct rb_node *p, int color) { rb->__rb_parent_color = (unsigned long)p + color; } static inline void __rb_change_child(struct rb_node *old, struct rb_node *new, struct rb_node *parent, struct rb_root *root) { if (parent) { if (parent->rb_left == old) WRITE_ONCE(parent->rb_left, new); else WRITE_ONCE(parent->rb_right, new); } else WRITE_ONCE(root->rb_node, new); } static inline void __rb_change_child_rcu(struct rb_node *old, struct rb_node *new, struct rb_node *parent, struct rb_root *root) { if (parent) { if (parent->rb_left == old) rcu_assign_pointer(parent->rb_left, new); else rcu_assign_pointer(parent->rb_right, new); } else rcu_assign_pointer(root->rb_node, new); } extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); static __always_inline struct rb_node * __rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right; struct rb_node *tmp = node->rb_left; struct rb_node *parent, *rebalance; unsigned long pc; if (!tmp) { /* * Case 1: node to erase has no more than 1 child (easy!) * * Note that if there is one child it must be red due to 5) * and node must be black due to 4). We adjust colors locally * so as to bypass __rb_erase_color() later on. */ pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, child, parent, root); if (child) { child->__rb_parent_color = pc; rebalance = NULL; } else rebalance = __rb_is_black(pc) ? parent : NULL; tmp = parent; } else if (!child) { /* Still case 1, but this time the child is node->rb_left */ tmp->__rb_parent_color = pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, tmp, parent, root); rebalance = NULL; tmp = parent; } else { struct rb_node *successor = child, *child2; tmp = child->rb_left; if (!tmp) { /* * Case 2: node's successor is its right child * * (n) (s) * / \ / \ * (x) (s) -> (x) (c) * \ * (c) */ parent = successor; child2 = successor->rb_right; augment->copy(node, successor); } else { /* * Case 3: node's successor is leftmost under * node's right child subtree * * (n) (s) * / \ / \ * (x) (y) -> (x) (y) * / / * (p) (p) * / / * (s) (c) * \ * (c) */ do { parent = successor; successor = tmp; tmp = tmp->rb_left; } while (tmp); child2 = successor->rb_right; WRITE_ONCE(parent->rb_left, child2); WRITE_ONCE(successor->rb_right, child); rb_set_parent(child, successor); augment->copy(node, successor); augment->propagate(parent, successor); } tmp = node->rb_left; WRITE_ONCE(successor->rb_left, tmp); rb_set_parent(tmp, successor); pc = node->__rb_parent_color; tmp = __rb_parent(pc); __rb_change_child(node, successor, tmp, root); if (child2) { rb_set_parent_color(child2, parent, RB_BLACK); rebalance = NULL; } else { rebalance = rb_is_black(successor) ? parent : NULL; } successor->__rb_parent_color = pc; tmp = successor; } augment->propagate(tmp, NULL); return rebalance; } static __always_inline void rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } static __always_inline void rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root, const struct rb_augment_callbacks *augment) { if (root->rb_leftmost == node) root->rb_leftmost = rb_next(node); rb_erase_augmented(node, &root->rb_root, augment); } #endif /* _LINUX_RBTREE_AUGMENTED_H */ |
70 14 189 70 56 14 70 69 14 56 64 63 10 54 68 70 66 63 69 69 137 135 135 137 137 138 66 75 137 1 1 9 1 8 9 9 6 6 6 6 6 5 1 9 9 9 9 9 8 1 9 9 1 8 9 9 6 6 6 6 233 232 233 233 231 233 2 2 2 3 2 3 9 9 9 8 1 8 9 507 510 50 508 506 498 9 507 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 | // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/file.c - kernfs file implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> #include <linux/fsnotify.h> #include <linux/uio.h> #include "kernfs-internal.h" struct kernfs_open_node { struct rcu_head rcu_head; atomic_t event; wait_queue_head_t poll; struct list_head files; /* goes through kernfs_open_file.list */ unsigned int nr_mmapped; unsigned int nr_to_release; }; /* * kernfs_notify() may be called from any context and bounces notifications * through a work item. To minimize space overhead in kernfs_node, the * pending queue is implemented as a singly linked list of kernfs_nodes. * The list is terminated with the self pointer so that whether a * kernfs_node is on the list or not can be determined by testing the next * pointer for %NULL. */ #define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list) static DEFINE_SPINLOCK(kernfs_notify_lock); static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL; static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn) { int idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS); return &kernfs_locks->open_file_mutex[idx]; } static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn) { struct mutex *lock; lock = kernfs_open_file_mutex_ptr(kn); mutex_lock(lock); return lock; } /** * of_on - Get the kernfs_open_node of the specified kernfs_open_file * @of: target kernfs_open_file * * Return: the kernfs_open_node of the kernfs_open_file */ static struct kernfs_open_node *of_on(struct kernfs_open_file *of) { return rcu_dereference_protected(of->kn->attr.open, !list_empty(&of->list)); } /** * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn * * @kn: target kernfs_node. * * Fetch and return ->attr.open of @kn when caller holds the * kernfs_open_file_mutex_ptr(kn). * * Update of ->attr.open happens under kernfs_open_file_mutex_ptr(kn). So when * the caller guarantees that this mutex is being held, other updaters can't * change ->attr.open and this means that we can safely deref ->attr.open * outside RCU read-side critical section. * * The caller needs to make sure that kernfs_open_file_mutex is held. * * Return: @kn->attr.open when kernfs_open_file_mutex is held. */ static struct kernfs_open_node * kernfs_deref_open_node_locked(struct kernfs_node *kn) { return rcu_dereference_protected(kn->attr.open, lockdep_is_held(kernfs_open_file_mutex_ptr(kn))); } static struct kernfs_open_file *kernfs_of(struct file *file) { return ((struct seq_file *)file->private_data)->private; } /* * Determine the kernfs_ops for the given kernfs_node. This function must * be called while holding an active reference. */ static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) { if (kn->flags & KERNFS_LOCKDEP) lockdep_assert_held(kn); return kn->attr.ops; } /* * As kernfs_seq_stop() is also called after kernfs_seq_start() or * kernfs_seq_next() failure, it needs to distinguish whether it's stopping * a seq_file iteration which is fully initialized with an active reference * or an aborted kernfs_seq_start() due to get_active failure. The * position pointer is the only context for each seq_file iteration and * thus the stop condition should be encoded in it. As the return value is * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable * choice to indicate get_active failure. * * Unfortunately, this is complicated due to the optional custom seq_file * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop() * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or * custom seq_file operations and thus can't decide whether put_active * should be performed or not only on ERR_PTR(-ENODEV). * * This is worked around by factoring out the custom seq_stop() and * put_active part into kernfs_seq_stop_active(), skipping it from * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures * that kernfs_seq_stop_active() is skipped only after get_active failure. */ static void kernfs_seq_stop_active(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_stop) ops->seq_stop(sf, v); kernfs_put_active(of->kn); } static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops; /* * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) return ERR_PTR(-ENODEV); ops = kernfs_ops(of->kn); if (ops->seq_start) { void *next = ops->seq_start(sf, ppos); /* see the comment above kernfs_seq_stop_active() */ if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; } return single_start(sf, ppos); } static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_next) { void *next = ops->seq_next(sf, v, ppos); /* see the comment above kernfs_seq_stop_active() */ if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; } else { /* * The same behavior and code as single_open(), always * terminate after the initial read. */ ++*ppos; return NULL; } } static void kernfs_seq_stop(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; if (v != ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, v); mutex_unlock(&of->mutex); } static int kernfs_seq_show(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; of->event = atomic_read(&of_on(of)->event); return of->kn->attr.ops->seq_show(sf, v); } static const struct seq_operations kernfs_seq_ops = { .start = kernfs_seq_start, .next = kernfs_seq_next, .stop = kernfs_seq_stop, .show = kernfs_seq_show, }; /* * As reading a bin file can have side-effects, the exact offset and bytes * specified in read(2) call should be passed to the read callback making * it difficult to use seq_file. Implement simplistic custom buffering for * bin files. */ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE); const struct kernfs_ops *ops; char *buf; buf = of->prealloc_buf; if (buf) mutex_lock(&of->prealloc_mutex); else buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; /* * @of->mutex nests outside active ref and is used both to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { len = -ENODEV; mutex_unlock(&of->mutex); goto out_free; } of->event = atomic_read(&of_on(of)->event); ops = kernfs_ops(of->kn); if (ops->read) len = ops->read(of, buf, len, iocb->ki_pos); else len = -EINVAL; kernfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len < 0) goto out_free; if (copy_to_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) mutex_unlock(&of->prealloc_mutex); else kfree(buf); return len; } static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter) { if (kernfs_of(iocb->ki_filp)->kn->flags & KERNFS_HAS_SEQ_SHOW) return seq_read_iter(iocb, iter); return kernfs_file_read_iter(iocb, iter); } /* * Copy data in from userland and pass it to the matching kernfs write * operation. * * There is no easy way for us to know if userspace is only doing a partial * write, so we don't support them. We expect the entire buffer to come on * the first write. Hint: if you're writing a value, first read the file, * modify only the value you're changing, then write entire buffer * back. */ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) { struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = iov_iter_count(iter); const struct kernfs_ops *ops; char *buf; if (of->atomic_write_len) { if (len > of->atomic_write_len) return -E2BIG; } else { len = min_t(size_t, len, PAGE_SIZE); } buf = of->prealloc_buf; if (buf) mutex_lock(&of->prealloc_mutex); else buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; if (copy_from_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } buf[len] = '\0'; /* guarantee string termination */ /* * @of->mutex nests outside active ref and is used both to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { mutex_unlock(&of->mutex); len = -ENODEV; goto out_free; } ops = kernfs_ops(of->kn); if (ops->write) len = ops->write(of, buf, len, iocb->ki_pos); else len = -EINVAL; kernfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len > 0) iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) mutex_unlock(&of->prealloc_mutex); else kfree(buf); return len; } static void kernfs_vma_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); if (!of->vm_ops) return; if (!kernfs_get_active(of->kn)) return; if (of->vm_ops->open) of->vm_ops->open(vma); kernfs_put_active(of->kn); } static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!kernfs_get_active(of->kn)) return VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) ret = of->vm_ops->fault(vmf); kernfs_put_active(of->kn); return ret; } static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!kernfs_get_active(of->kn)) return VM_FAULT_SIGBUS; ret = 0; if (of->vm_ops->page_mkwrite) ret = of->vm_ops->page_mkwrite(vmf); else file_update_time(file); kernfs_put_active(of->kn); return ret; } static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) return -EINVAL; if (!kernfs_get_active(of->kn)) return -EINVAL; ret = -EINVAL; if (of->vm_ops->access) ret = of->vm_ops->access(vma, addr, buf, len, write); kernfs_put_active(of->kn); return ret; } static const struct vm_operations_struct kernfs_vm_ops = { .open = kernfs_vma_open, .fault = kernfs_vma_fault, .page_mkwrite = kernfs_vma_page_mkwrite, .access = kernfs_vma_access, }; static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) { struct kernfs_open_file *of = kernfs_of(file); const struct kernfs_ops *ops; int rc; /* * mmap path and of->mutex are prone to triggering spurious lockdep * warnings and we don't want to add spurious locking dependency * between the two. Check whether mmap is actually implemented * without grabbing @of->mutex by testing HAS_MMAP flag. See the * comment in kernfs_fop_open() for more details. */ if (!(of->kn->flags & KERNFS_HAS_MMAP)) return -ENODEV; mutex_lock(&of->mutex); rc = -ENODEV; if (!kernfs_get_active(of->kn)) goto out_unlock; ops = kernfs_ops(of->kn); rc = ops->mmap(of, vma); if (rc) goto out_put; /* * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() * to satisfy versions of X which crash if the mmap fails: that * substitutes a new vm_file, and we don't then want bin_vm_ops. */ if (vma->vm_file != file) goto out_put; rc = -EINVAL; if (of->mmapped && of->vm_ops != vma->vm_ops) goto out_put; /* * It is not possible to successfully wrap close. * So error if someone is trying to use close. */ if (vma->vm_ops && vma->vm_ops->close) goto out_put; rc = 0; if (!of->mmapped) { of->mmapped = true; of_on(of)->nr_mmapped++; of->vm_ops = vma->vm_ops; } vma->vm_ops = &kernfs_vm_ops; out_put: kernfs_put_active(of->kn); out_unlock: mutex_unlock(&of->mutex); return rc; } /** * kernfs_get_open_node - get or create kernfs_open_node * @kn: target kernfs_node * @of: kernfs_open_file for this instance of open * * If @kn->attr.open exists, increment its reference count; otherwise, * create one. @of is chained to the files list. * * Locking: * Kernel thread context (may sleep). * * Return: * %0 on success, -errno on failure. */ static int kernfs_get_open_node(struct kernfs_node *kn, struct kernfs_open_file *of) { struct kernfs_open_node *on; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { /* not there, initialize a new one */ on = kzalloc(sizeof(*on), GFP_KERNEL); if (!on) { mutex_unlock(mutex); return -ENOMEM; } atomic_set(&on->event, 1); init_waitqueue_head(&on->poll); INIT_LIST_HEAD(&on->files); rcu_assign_pointer(kn->attr.open, on); } list_add_tail(&of->list, &on->files); if (kn->flags & KERNFS_HAS_RELEASE) on->nr_to_release++; mutex_unlock(mutex); return 0; } /** * kernfs_unlink_open_file - Unlink @of from @kn. * * @kn: target kernfs_node * @of: associated kernfs_open_file * @open_failed: ->open() failed, cancel ->release() * * Unlink @of from list of @kn's associated open files. If list of * associated open files becomes empty, disassociate and free * kernfs_open_node. * * LOCKING: * None. */ static void kernfs_unlink_open_file(struct kernfs_node *kn, struct kernfs_open_file *of, bool open_failed) { struct kernfs_open_node *on; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } if (of) { if (kn->flags & KERNFS_HAS_RELEASE) { WARN_ON_ONCE(of->released == open_failed); if (open_failed) on->nr_to_release--; } if (of->mmapped) on->nr_mmapped--; list_del(&of->list); } if (list_empty(&on->files)) { rcu_assign_pointer(kn->attr.open, NULL); kfree_rcu(on, rcu_head); } mutex_unlock(mutex); } static int kernfs_fop_open(struct inode *inode, struct file *file) { struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; bool has_read, has_write, has_mmap; int error = -EACCES; if (!kernfs_get_active(kn)) return -ENODEV; ops = kernfs_ops(kn); has_read = ops->seq_show || ops->read || ops->mmap; has_write = ops->write || ops->mmap; has_mmap = ops->mmap; /* see the flag definition for details */ if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { if ((file->f_mode & FMODE_WRITE) && (!(inode->i_mode & S_IWUGO) || !has_write)) goto err_out; if ((file->f_mode & FMODE_READ) && (!(inode->i_mode & S_IRUGO) || !has_read)) goto err_out; } /* allocate a kernfs_open_file for the file */ error = -ENOMEM; of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL); if (!of) goto err_out; /* * The following is done to give a different lockdep key to * @of->mutex for files which implement mmap. This is a rather * crude way to avoid false positive lockdep warning around * mm->mmap_lock - mmap nests @of->mutex under mm->mmap_lock and * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under * which mm->mmap_lock nests, while holding @of->mutex. As each * open file has a separate mutex, it's okay as long as those don't * happen on the same file. At this point, we can't easily give * each file a separate locking class. Let's differentiate on * whether the file has mmap or not for now. * * For similar reasons, writable and readonly files are given different * lockdep key, because the writable file /sys/power/resume may call vfs * lookup helpers for arbitrary paths and readonly files can be read by * overlayfs from vfs helpers when sysfs is a lower layer of overalyfs. * * All three cases look the same. They're supposed to * look that way and give @of->mutex different static lockdep keys. */ if (has_mmap) mutex_init(&of->mutex); else if (file->f_mode & FMODE_WRITE) mutex_init(&of->mutex); else mutex_init(&of->mutex); of->kn = kn; of->file = file; /* * Write path needs to atomic_write_len outside active reference. * Cache it in open_file. See kernfs_fop_write_iter() for details. */ of->atomic_write_len = ops->atomic_write_len; error = -EINVAL; /* * ->seq_show is incompatible with ->prealloc, * as seq_read does its own allocation. * ->read must be used instead. */ if (ops->prealloc && ops->seq_show) goto err_free; if (ops->prealloc) { int len = of->atomic_write_len ?: PAGE_SIZE; of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL); error = -ENOMEM; if (!of->prealloc_buf) goto err_free; mutex_init(&of->prealloc_mutex); } /* * Always instantiate seq_file even if read access doesn't use * seq_file or is not requested. This unifies private data access * and readable regular files are the vast majority anyway. */ if (ops->seq_show) error = seq_open(file, &kernfs_seq_ops); else error = seq_open(file, NULL); if (error) goto err_free; of->seq_file = file->private_data; of->seq_file->private = of; /* seq_file clears PWRITE unconditionally, restore it if WRITE */ if (file->f_mode & FMODE_WRITE) file->f_mode |= FMODE_PWRITE; /* make sure we have open node struct */ error = kernfs_get_open_node(kn, of); if (error) goto err_seq_release; if (ops->open) { /* nobody has access to @of yet, skip @of->mutex */ error = ops->open(of); if (error) goto err_put_node; } /* open succeeded, put active references */ kernfs_put_active(kn); return 0; err_put_node: kernfs_unlink_open_file(kn, of, true); err_seq_release: seq_release(inode, file); err_free: kfree(of->prealloc_buf); kfree(of); err_out: kernfs_put_active(kn); return error; } /* used from release/drain to ensure that ->release() is called exactly once */ static void kernfs_release_file(struct kernfs_node *kn, struct kernfs_open_file *of) { /* * @of is guaranteed to have no other file operations in flight and * we just want to synchronize release and drain paths. * @kernfs_open_file_mutex_ptr(kn) is enough. @of->mutex can't be used * here because drain path may be called from places which can * cause circular dependency. */ lockdep_assert_held(kernfs_open_file_mutex_ptr(kn)); if (!of->released) { /* * A file is never detached without being released and we * need to be able to release files which are deactivated * and being drained. Don't use kernfs_ops(). */ kn->attr.ops->release(of); of->released = true; of_on(of)->nr_to_release--; } } static int kernfs_fop_release(struct inode *inode, struct file *filp) { struct kernfs_node *kn = inode->i_private; struct kernfs_open_file *of = kernfs_of(filp); if (kn->flags & KERNFS_HAS_RELEASE) { struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); kernfs_release_file(kn, of); mutex_unlock(mutex); } kernfs_unlink_open_file(kn, of, false); seq_release(inode, filp); kfree(of->prealloc_buf); kfree(of); return 0; } bool kernfs_should_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; bool ret; /* * @kn being deactivated guarantees that @kn->attr.open can't change * beneath us making the lockless test below safe. */ WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); rcu_read_lock(); on = rcu_dereference(kn->attr.open); ret = on && (on->nr_mmapped || on->nr_to_release); rcu_read_unlock(); return ret; } void kernfs_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; struct kernfs_open_file *of; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } list_for_each_entry(of, &on->files, list) { struct inode *inode = file_inode(of->file); if (of->mmapped) { unmap_mapping_range(inode->i_mapping, 0, 0, 1); of->mmapped = false; on->nr_mmapped--; } if (kn->flags & KERNFS_HAS_RELEASE) kernfs_release_file(kn, of); } WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release); mutex_unlock(mutex); } /* * Kernfs attribute files are pollable. The idea is that you read * the content and then you use 'poll' or 'select' to wait for * the content to change. When the content changes (assuming the * manager for the kobject supports notification), poll will * return EPOLLERR|EPOLLPRI, and select will return the fd whether * it is waiting for read, write, or exceptions. * Once poll/select indicates that the value has changed, you * need to close and re-open the file, or seek to 0 and read again. * Reminder: this only works for attributes which actively support * it, and it is not possible to test an attribute from userspace * to see if it supports poll (Neither 'poll' nor 'select' return * an appropriate error code). When in doubt, set a suitable timeout value. */ __poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) { struct kernfs_open_node *on = of_on(of); poll_wait(of->file, &on->poll, wait); if (of->event != atomic_read(&on->event)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; return DEFAULT_POLLMASK; } static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); __poll_t ret; if (!kernfs_get_active(kn)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; if (kn->attr.ops->poll) ret = kn->attr.ops->poll(of, wait); else ret = kernfs_generic_poll(of, wait); kernfs_put_active(kn); return ret; } static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence) { struct kernfs_open_file *of = kernfs_of(file); const struct kernfs_ops *ops; loff_t ret; /* * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { mutex_unlock(&of->mutex); return -ENODEV; } ops = kernfs_ops(of->kn); if (ops->llseek) ret = ops->llseek(of, offset, whence); else ret = generic_file_llseek(file, offset, whence); kernfs_put_active(of->kn); mutex_unlock(&of->mutex); return ret; } static void kernfs_notify_workfn(struct work_struct *work) { struct kernfs_node *kn; struct kernfs_super_info *info; struct kernfs_root *root; repeat: /* pop one off the notify_list */ spin_lock_irq(&kernfs_notify_lock); kn = kernfs_notify_list; if (kn == KERNFS_NOTIFY_EOL) { spin_unlock_irq(&kernfs_notify_lock); return; } kernfs_notify_list = kn->attr.notify_next; kn->attr.notify_next = NULL; spin_unlock_irq(&kernfs_notify_lock); root = kernfs_root(kn); /* kick fsnotify */ down_read(&root->kernfs_supers_rwsem); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct kernfs_node *parent; struct inode *p_inode = NULL; struct inode *inode; struct qstr name; /* * We want fsnotify_modify() on @kn but as the * modifications aren't originating from userland don't * have the matching @file available. Look up the inodes * and generate the events manually. */ inode = ilookup(info->sb, kernfs_ino(kn)); if (!inode) continue; name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name)); parent = kernfs_get_parent(kn); if (parent) { p_inode = ilookup(info->sb, kernfs_ino(parent)); if (p_inode) { fsnotify(FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, p_inode, &name, inode, 0); iput(p_inode); } kernfs_put(parent); } if (!p_inode) fsnotify_inode(inode, FS_MODIFY); iput(inode); } up_read(&root->kernfs_supers_rwsem); kernfs_put(kn); goto repeat; } /** * kernfs_notify - notify a kernfs file * @kn: file to notify * * Notify @kn such that poll(2) on @kn wakes up. Maybe be called from any * context. */ void kernfs_notify(struct kernfs_node *kn) { static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn); unsigned long flags; struct kernfs_open_node *on; if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) return; /* kick poll immediately */ rcu_read_lock(); on = rcu_dereference(kn->attr.open); if (on) { atomic_inc(&on->event); wake_up_interruptible(&on->poll); } rcu_read_unlock(); /* schedule work to kick fsnotify */ spin_lock_irqsave(&kernfs_notify_lock, flags); if (!kn->attr.notify_next) { kernfs_get(kn); kn->attr.notify_next = kernfs_notify_list; kernfs_notify_list = kn; schedule_work(&kernfs_notify_work); } spin_unlock_irqrestore(&kernfs_notify_lock, flags); } EXPORT_SYMBOL_GPL(kernfs_notify); const struct file_operations kernfs_file_fops = { .read_iter = kernfs_fop_read_iter, .write_iter = kernfs_fop_write_iter, .llseek = kernfs_fop_llseek, .mmap = kernfs_fop_mmap, .open = kernfs_fop_open, .release = kernfs_fop_release, .poll = kernfs_fop_poll, .fsync = noop_fsync, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, }; /** * __kernfs_create_file - kernfs internal function to create a file * @parent: directory to create the file in * @name: name of the file * @mode: mode of the file * @uid: uid of the file * @gid: gid of the file * @size: size of the file * @ops: kernfs operations for the file * @priv: private data for the file * @ns: optional namespace tag of the file * @key: lockdep key for the file's active_ref, %NULL to disable lockdep * * Return: the created node on success, ERR_PTR() value on error. */ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) { struct kernfs_node *kn; unsigned flags; int rc; flags = KERNFS_FILE; kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, uid, gid, flags); if (!kn) return ERR_PTR(-ENOMEM); kn->attr.ops = ops; kn->attr.size = size; kn->ns = ns; kn->priv = priv; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (key) { lockdep_init_map(&kn->dep_map, "kn->active", key, 0); kn->flags |= KERNFS_LOCKDEP; } #endif /* * kn->attr.ops is accessible only while holding active ref. We * need to know whether some ops are implemented outside active * ref. Cache their existence in flags. */ if (ops->seq_show) kn->flags |= KERNFS_HAS_SEQ_SHOW; if (ops->mmap) kn->flags |= KERNFS_HAS_MMAP; if (ops->release) kn->flags |= KERNFS_HAS_RELEASE; rc = kernfs_add_one(kn); if (rc) { kernfs_put(kn); return ERR_PTR(rc); } return kn; } |
4 4 4 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM block #if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BLOCK_H #include <linux/blktrace_api.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/tracepoint.h> #include <uapi/linux/ioprio.h> #define RWBS_LEN 8 #define IOPRIO_CLASS_STRINGS \ { IOPRIO_CLASS_NONE, "none" }, \ { IOPRIO_CLASS_RT, "rt" }, \ { IOPRIO_CLASS_BE, "be" }, \ { IOPRIO_CLASS_IDLE, "idle" }, \ { IOPRIO_CLASS_INVALID, "invalid"} #ifdef CONFIG_BUFFER_HEAD DECLARE_EVENT_CLASS(block_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh), TP_STRUCT__entry ( __field( dev_t, dev ) __field( sector_t, sector ) __field( size_t, size ) ), TP_fast_assign( __entry->dev = bh->b_bdev->bd_dev; __entry->sector = bh->b_blocknr; __entry->size = bh->b_size; ), TP_printk("%d,%d sector=%llu size=%zu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->sector, __entry->size ) ); /** * block_touch_buffer - mark a buffer accessed * @bh: buffer_head being touched * * Called from touch_buffer(). */ DEFINE_EVENT(block_buffer, block_touch_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh) ); /** * block_dirty_buffer - mark a buffer dirty * @bh: buffer_head being dirtied * * Called from mark_buffer_dirty(). */ DEFINE_EVENT(block_buffer, block_dirty_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh) ); #endif /* CONFIG_BUFFER_HEAD */ /** * block_rq_requeue - place block IO request back on a queue * @rq: block IO operation request * * The block operation request @rq is being placed back into queue * @q. For some reason the request was not completed and needs to be * put back in the queue. */ TRACE_EVENT(block_rq_requeue, TP_PROTO(struct request *rq), TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( unsigned short, ioprio ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->ioprio = rq->ioprio; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), TP_printk("%d,%d %s (%s) %llu + %u %s,%u,%u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, __print_symbolic(IOPRIO_PRIO_CLASS(__entry->ioprio), IOPRIO_CLASS_STRINGS), IOPRIO_PRIO_HINT(__entry->ioprio), IOPRIO_PRIO_LEVEL(__entry->ioprio), 0) ); DECLARE_EVENT_CLASS(block_rq_completion, TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), TP_ARGS(rq, error, nr_bytes), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( int , error ) __field( unsigned short, ioprio ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_pos(rq); __entry->nr_sector = nr_bytes >> 9; __entry->error = blk_status_to_errno(error); __entry->ioprio = rq->ioprio; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), TP_printk("%d,%d %s (%s) %llu + %u %s,%u,%u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, __print_symbolic(IOPRIO_PRIO_CLASS(__entry->ioprio), IOPRIO_CLASS_STRINGS), IOPRIO_PRIO_HINT(__entry->ioprio), IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->error) ); /** * block_rq_complete - block IO operation completed by device driver * @rq: block operations request * @error: status code * @nr_bytes: number of completed bytes * * The block_rq_complete tracepoint event indicates that some portion * of operation request has been completed by the device driver. If * the @rq->bio is %NULL, then there is absolutely no additional work to * do for the request. If @rq->bio is non-NULL then there is * additional work required to complete the request. */ DEFINE_EVENT(block_rq_completion, block_rq_complete, TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), TP_ARGS(rq, error, nr_bytes) ); /** * block_rq_error - block IO operation error reported by device driver * @rq: block operations request * @error: status code * @nr_bytes: number of completed bytes * * The block_rq_error tracepoint event indicates that some portion * of operation request has failed as reported by the device driver. */ DEFINE_EVENT(block_rq_completion, block_rq_error, TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), TP_ARGS(rq, error, nr_bytes) ); DECLARE_EVENT_CLASS(block_rq, TP_PROTO(struct request *rq), TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( unsigned int, bytes ) __field( unsigned short, ioprio ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->bytes = blk_rq_bytes(rq); __entry->ioprio = rq->ioprio; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %u (%s) %llu + %u %s,%u,%u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __entry->bytes, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, __print_symbolic(IOPRIO_PRIO_CLASS(__entry->ioprio), IOPRIO_CLASS_STRINGS), IOPRIO_PRIO_HINT(__entry->ioprio), IOPRIO_PRIO_LEVEL(__entry->ioprio), __entry->comm) ); /** * block_rq_insert - insert block operation request into queue * @rq: block IO operation request * * Called immediately before block operation request @rq is inserted * into queue @q. The fields in the operation request @rq struct can * be examined to determine which device and sectors the pending * operation would access. */ DEFINE_EVENT(block_rq, block_rq_insert, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_rq_issue - issue pending block IO request operation to device driver * @rq: block IO operation request * * Called when block operation request @rq from queue @q is sent to a * device driver for processing. */ DEFINE_EVENT(block_rq, block_rq_issue, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_rq_merge - merge request with another one in the elevator * @rq: block IO operation request * * Called when block operation request @rq from queue @q is merged to another * request queued in the elevator. */ DEFINE_EVENT(block_rq, block_rq_merge, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_io_start - insert a request for execution * @rq: block IO operation request * * Called when block operation request @rq is queued for execution */ DEFINE_EVENT(block_rq, block_io_start, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_io_done - block IO operation request completed * @rq: block IO operation request * * Called when block operation request @rq is completed */ DEFINE_EVENT(block_rq, block_io_done, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_bio_complete - completed all work on the block operation * @q: queue holding the block operation * @bio: block operation completed * * This tracepoint indicates there is no further work to do on this * block IO operation @bio. */ TRACE_EVENT(block_bio_complete, TP_PROTO(struct request_queue *q, struct bio *bio), TP_ARGS(q, bio), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned, nr_sector ) __field( int, error ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->error = blk_status_to_errno(bio->bi_status); blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, __entry->error) ); DECLARE_EVENT_CLASS(block_bio, TP_PROTO(struct bio *bio), TP_ARGS(bio), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_opf); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, __entry->comm) ); /** * block_bio_bounce - used bounce buffer when processing block operation * @bio: block operation * * A bounce buffer was used to handle the block operation @bio in @q. * This occurs when hardware limitations prevent a direct transfer of * data between the @bio data memory area and the IO device. Use of a * bounce buffer requires extra copying of data and decreases * performance. */ DEFINE_EVENT(block_bio, block_bio_bounce, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_backmerge - merging block operation to the end of an existing operation * @bio: new block operation to merge * * Merging block request @bio to the end of an existing block request. */ DEFINE_EVENT(block_bio, block_bio_backmerge, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_frontmerge - merging block operation to the beginning of an existing operation * @bio: new block operation to merge * * Merging block IO operation @bio to the beginning of an existing block request. */ DEFINE_EVENT(block_bio, block_bio_frontmerge, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_queue - putting new block IO operation in queue * @bio: new block operation * * About to place the block IO operation @bio into queue @q. */ DEFINE_EVENT(block_bio, block_bio_queue, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_getrq - get a free request entry in queue for block IO operations * @bio: pending block IO operation (can be %NULL) * * A request struct has been allocated to handle the block IO operation @bio. */ DEFINE_EVENT(block_bio, block_getrq, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_plug - keep operations requests in request queue * @q: request queue to plug * * Plug the request queue @q. Do not allow block operation requests * to be sent to the device driver. Instead, accumulate requests in * the queue to improve throughput performance of the block device. */ TRACE_EVENT(block_plug, TP_PROTO(struct request_queue *q), TP_ARGS(q), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("[%s]", __entry->comm) ); DECLARE_EVENT_CLASS(block_unplug, TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit), TP_ARGS(q, depth, explicit), TP_STRUCT__entry( __field( int, nr_rq ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->nr_rq = depth; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("[%s] %d", __entry->comm, __entry->nr_rq) ); /** * block_unplug - release of operations requests in request queue * @q: request queue to unplug * @depth: number of requests just added to the queue * @explicit: whether this was an explicit unplug, or one from schedule() * * Unplug request queue @q because device driver is scheduled to work * on elements in the request queue. */ DEFINE_EVENT(block_unplug, block_unplug, TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit), TP_ARGS(q, depth, explicit) ); /** * block_split - split a single bio struct into two bio structs * @bio: block operation being split * @new_sector: The starting sector for the new bio * * The bio request @bio needs to be split into two bio requests. The newly * created @bio request starts at @new_sector. This split may be required due to * hardware limitations such as operation crossing device boundaries in a RAID * system. */ TRACE_EVENT(block_split, TP_PROTO(struct bio *bio, unsigned int new_sector), TP_ARGS(bio, new_sector), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( sector_t, new_sector ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->new_sector = new_sector; blk_fill_rwbs(__entry->rwbs, bio->bi_opf); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %llu / %llu [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, (unsigned long long)__entry->new_sector, __entry->comm) ); /** * block_bio_remap - map request for a logical device to the raw device * @bio: revised operation * @dev: original device for the operation * @from: original sector for the operation * * An operation for a logical device has been mapped to the * raw block device. */ TRACE_EVENT(block_bio_remap, TP_PROTO(struct bio *bio, dev_t dev, sector_t from), TP_ARGS(bio, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( dev_t, old_dev ) __field( sector_t, old_sector ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; __entry->old_sector = from; blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, MAJOR(__entry->old_dev), MINOR(__entry->old_dev), (unsigned long long)__entry->old_sector) ); /** * block_rq_remap - map request for a block operation request * @rq: block IO operation request * @dev: device for the operation * @from: original sector for the operation * * The block operation request @rq in @q has been remapped. The block * operation request @rq holds the current information and @from hold * the original sector. */ TRACE_EVENT(block_rq_remap, TP_PROTO(struct request *rq, dev_t dev, sector_t from), TP_ARGS(rq, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( dev_t, old_dev ) __field( sector_t, old_sector ) __field( unsigned int, nr_bios ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = disk_devt(rq->q->disk); __entry->sector = blk_rq_pos(rq); __entry->nr_sector = blk_rq_sectors(rq); __entry->old_dev = dev; __entry->old_sector = from; __entry->nr_bios = blk_rq_count_bios(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, MAJOR(__entry->old_dev), MINOR(__entry->old_dev), (unsigned long long)__entry->old_sector, __entry->nr_bios) ); #endif /* _TRACE_BLOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
6 6 3 3 4 12 9 1 9 12 10 12 6 6 6 5 4 3 3 2 1 1 6 7 1 6 1 2 2 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Intel Corporation. All rights reserved. * Copyright (C) 2014 Marvell International Ltd. */ #define pr_fmt(fmt) "llcp: %s: " fmt, __func__ #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/nfc.h> #include "nfc.h" #include "llcp.h" static u8 llcp_magic[3] = {0x46, 0x66, 0x6d}; static LIST_HEAD(llcp_devices); /* Protects llcp_devices list */ static DEFINE_SPINLOCK(llcp_devices_lock); static void nfc_llcp_rx_skb(struct nfc_llcp_local *local, struct sk_buff *skb); void nfc_llcp_sock_link(struct llcp_sock_list *l, struct sock *sk) { write_lock(&l->lock); sk_add_node(sk, &l->head); write_unlock(&l->lock); } void nfc_llcp_sock_unlink(struct llcp_sock_list *l, struct sock *sk) { write_lock(&l->lock); sk_del_node_init(sk); write_unlock(&l->lock); } void nfc_llcp_socket_remote_param_init(struct nfc_llcp_sock *sock) { sock->remote_rw = LLCP_DEFAULT_RW; sock->remote_miu = LLCP_MAX_MIU + 1; } static void nfc_llcp_socket_purge(struct nfc_llcp_sock *sock) { struct nfc_llcp_local *local = sock->local; struct sk_buff *s, *tmp; skb_queue_purge(&sock->tx_queue); skb_queue_purge(&sock->tx_pending_queue); if (local == NULL) return; /* Search for local pending SKBs that are related to this socket */ skb_queue_walk_safe(&local->tx_queue, s, tmp) { if (s->sk != &sock->sk) continue; skb_unlink(s, &local->tx_queue); kfree_skb(s); } } static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool device, int err) { struct sock *sk; struct hlist_node *tmp; struct nfc_llcp_sock *llcp_sock; skb_queue_purge(&local->tx_queue); write_lock(&local->sockets.lock); sk_for_each_safe(sk, tmp, &local->sockets.head) { llcp_sock = nfc_llcp_sock(sk); bh_lock_sock(sk); nfc_llcp_socket_purge(llcp_sock); if (sk->sk_state == LLCP_CONNECTED) nfc_put_device(llcp_sock->dev); if (sk->sk_state == LLCP_LISTEN) { struct nfc_llcp_sock *lsk, *n; struct sock *accept_sk; list_for_each_entry_safe(lsk, n, &llcp_sock->accept_queue, accept_queue) { accept_sk = &lsk->sk; bh_lock_sock(accept_sk); nfc_llcp_accept_unlink(accept_sk); if (err) accept_sk->sk_err = err; accept_sk->sk_state = LLCP_CLOSED; accept_sk->sk_state_change(sk); bh_unlock_sock(accept_sk); } } if (err) sk->sk_err = err; sk->sk_state = LLCP_CLOSED; sk->sk_state_change(sk); bh_unlock_sock(sk); sk_del_node_init(sk); } write_unlock(&local->sockets.lock); /* If we still have a device, we keep the RAW sockets alive */ if (device == true) return; write_lock(&local->raw_sockets.lock); sk_for_each_safe(sk, tmp, &local->raw_sockets.head) { llcp_sock = nfc_llcp_sock(sk); bh_lock_sock(sk); nfc_llcp_socket_purge(llcp_sock); if (err) sk->sk_err = err; sk->sk_state = LLCP_CLOSED; sk->sk_state_change(sk); bh_unlock_sock(sk); sk_del_node_init(sk); } write_unlock(&local->raw_sockets.lock); } static struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local) { /* Since using nfc_llcp_local may result in usage of nfc_dev, whenever * we hold a reference to local, we also need to hold a reference to * the device to avoid UAF. */ if (!nfc_get_device(local->dev->idx)) return NULL; kref_get(&local->ref); return local; } static void local_cleanup(struct nfc_llcp_local *local) { nfc_llcp_socket_release(local, false, ENXIO); del_timer_sync(&local->link_timer); skb_queue_purge(&local->tx_queue); cancel_work_sync(&local->tx_work); cancel_work_sync(&local->rx_work); cancel_work_sync(&local->timeout_work); kfree_skb(local->rx_pending); local->rx_pending = NULL; del_timer_sync(&local->sdreq_timer); cancel_work_sync(&local->sdreq_timeout_work); nfc_llcp_free_sdp_tlv_list(&local->pending_sdreqs); } static void local_release(struct kref *ref) { struct nfc_llcp_local *local; local = container_of(ref, struct nfc_llcp_local, ref); local_cleanup(local); kfree(local); } int nfc_llcp_local_put(struct nfc_llcp_local *local) { struct nfc_dev *dev; int ret; if (local == NULL) return 0; dev = local->dev; ret = kref_put(&local->ref, local_release); nfc_put_device(dev); return ret; } static struct nfc_llcp_sock *nfc_llcp_sock_get(struct nfc_llcp_local *local, u8 ssap, u8 dsap) { struct sock *sk; struct nfc_llcp_sock *llcp_sock, *tmp_sock; pr_debug("ssap dsap %d %d\n", ssap, dsap); if (ssap == 0 && dsap == 0) return NULL; read_lock(&local->sockets.lock); llcp_sock = NULL; sk_for_each(sk, &local->sockets.head) { tmp_sock = nfc_llcp_sock(sk); if (tmp_sock->ssap == ssap && tmp_sock->dsap == dsap) { llcp_sock = tmp_sock; sock_hold(&llcp_sock->sk); break; } } read_unlock(&local->sockets.lock); return llcp_sock; } static void nfc_llcp_sock_put(struct nfc_llcp_sock *sock) { sock_put(&sock->sk); } static void nfc_llcp_timeout_work(struct work_struct *work) { struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local, timeout_work); nfc_dep_link_down(local->dev); } static void nfc_llcp_symm_timer(struct timer_list *t) { struct nfc_llcp_local *local = from_timer(local, t, link_timer); pr_err("SYMM timeout\n"); schedule_work(&local->timeout_work); } static void nfc_llcp_sdreq_timeout_work(struct work_struct *work) { unsigned long time; HLIST_HEAD(nl_sdres_list); struct hlist_node *n; struct nfc_llcp_sdp_tlv *sdp; struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local, sdreq_timeout_work); mutex_lock(&local->sdreq_lock); time = jiffies - msecs_to_jiffies(3 * local->remote_lto); hlist_for_each_entry_safe(sdp, n, &local->pending_sdreqs, node) { if (time_after(sdp->time, time)) continue; sdp->sap = LLCP_SDP_UNBOUND; hlist_del(&sdp->node); hlist_add_head(&sdp->node, &nl_sdres_list); } if (!hlist_empty(&local->pending_sdreqs)) mod_timer(&local->sdreq_timer, jiffies + msecs_to_jiffies(3 * local->remote_lto)); mutex_unlock(&local->sdreq_lock); if (!hlist_empty(&nl_sdres_list)) nfc_genl_llc_send_sdres(local->dev, &nl_sdres_list); } static void nfc_llcp_sdreq_timer(struct timer_list *t) { struct nfc_llcp_local *local = from_timer(local, t, sdreq_timer); schedule_work(&local->sdreq_timeout_work); } struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev) { struct nfc_llcp_local *local; struct nfc_llcp_local *res = NULL; spin_lock(&llcp_devices_lock); list_for_each_entry(local, &llcp_devices, list) if (local->dev == dev) { res = nfc_llcp_local_get(local); break; } spin_unlock(&llcp_devices_lock); return res; } static struct nfc_llcp_local *nfc_llcp_remove_local(struct nfc_dev *dev) { struct nfc_llcp_local *local, *tmp; spin_lock(&llcp_devices_lock); list_for_each_entry_safe(local, tmp, &llcp_devices, list) if (local->dev == dev) { list_del(&local->list); spin_unlock(&llcp_devices_lock); return local; } spin_unlock(&llcp_devices_lock); pr_warn("Shutting down device not found\n"); return NULL; } static char *wks[] = { NULL, NULL, /* SDP */ "urn:nfc:sn:ip", "urn:nfc:sn:obex", "urn:nfc:sn:snep", }; static int nfc_llcp_wks_sap(const char *service_name, size_t service_name_len) { int sap, num_wks; pr_debug("%s\n", service_name); if (service_name == NULL) return -EINVAL; num_wks = ARRAY_SIZE(wks); for (sap = 0; sap < num_wks; sap++) { if (wks[sap] == NULL) continue; if (strncmp(wks[sap], service_name, service_name_len) == 0) return sap; } return -EINVAL; } static struct nfc_llcp_sock *nfc_llcp_sock_from_sn(struct nfc_llcp_local *local, const u8 *sn, size_t sn_len, bool needref) { struct sock *sk; struct nfc_llcp_sock *llcp_sock, *tmp_sock; pr_debug("sn %zd %p\n", sn_len, sn); if (sn == NULL || sn_len == 0) return NULL; read_lock(&local->sockets.lock); llcp_sock = NULL; sk_for_each(sk, &local->sockets.head) { tmp_sock = nfc_llcp_sock(sk); pr_debug("llcp sock %p\n", tmp_sock); if (tmp_sock->sk.sk_type == SOCK_STREAM && tmp_sock->sk.sk_state != LLCP_LISTEN) continue; if (tmp_sock->sk.sk_type == SOCK_DGRAM && tmp_sock->sk.sk_state != LLCP_BOUND) continue; if (tmp_sock->service_name == NULL || tmp_sock->service_name_len == 0) continue; if (tmp_sock->service_name_len != sn_len) continue; if (memcmp(sn, tmp_sock->service_name, sn_len) == 0) { llcp_sock = tmp_sock; if (needref) sock_hold(&llcp_sock->sk); break; } } read_unlock(&local->sockets.lock); pr_debug("Found llcp sock %p\n", llcp_sock); return llcp_sock; } u8 nfc_llcp_get_sdp_ssap(struct nfc_llcp_local *local, struct nfc_llcp_sock *sock) { mutex_lock(&local->sdp_lock); if (sock->service_name != NULL && sock->service_name_len > 0) { int ssap = nfc_llcp_wks_sap(sock->service_name, sock->service_name_len); if (ssap > 0) { pr_debug("WKS %d\n", ssap); /* This is a WKS, let's check if it's free */ if (test_bit(ssap, &local->local_wks)) { mutex_unlock(&local->sdp_lock); return LLCP_SAP_MAX; } set_bit(ssap, &local->local_wks); mutex_unlock(&local->sdp_lock); return ssap; } /* * Check if there already is a non WKS socket bound * to this service name. */ if (nfc_llcp_sock_from_sn(local, sock->service_name, sock->service_name_len, false) != NULL) { mutex_unlock(&local->sdp_lock); return LLCP_SAP_MAX; } mutex_unlock(&local->sdp_lock); return LLCP_SDP_UNBOUND; } else if (sock->ssap != 0 && sock->ssap < LLCP_WKS_NUM_SAP) { if (!test_bit(sock->ssap, &local->local_wks)) { set_bit(sock->ssap, &local->local_wks); mutex_unlock(&local->sdp_lock); return sock->ssap; } } mutex_unlock(&local->sdp_lock); return LLCP_SAP_MAX; } u8 nfc_llcp_get_local_ssap(struct nfc_llcp_local *local) { u8 local_ssap; mutex_lock(&local->sdp_lock); local_ssap = find_first_zero_bit(&local->local_sap, LLCP_LOCAL_NUM_SAP); if (local_ssap == LLCP_LOCAL_NUM_SAP) { mutex_unlock(&local->sdp_lock); return LLCP_SAP_MAX; } set_bit(local_ssap, &local->local_sap); mutex_unlock(&local->sdp_lock); return local_ssap + LLCP_LOCAL_SAP_OFFSET; } void nfc_llcp_put_ssap(struct nfc_llcp_local *local, u8 ssap) { u8 local_ssap; unsigned long *sdp; if (ssap < LLCP_WKS_NUM_SAP) { local_ssap = ssap; sdp = &local->local_wks; } else if (ssap < LLCP_LOCAL_NUM_SAP) { atomic_t *client_cnt; local_ssap = ssap - LLCP_WKS_NUM_SAP; sdp = &local->local_sdp; client_cnt = &local->local_sdp_cnt[local_ssap]; pr_debug("%d clients\n", atomic_read(client_cnt)); mutex_lock(&local->sdp_lock); if (atomic_dec_and_test(client_cnt)) { struct nfc_llcp_sock *l_sock; pr_debug("No more clients for SAP %d\n", ssap); clear_bit(local_ssap, sdp); /* Find the listening sock and set it back to UNBOUND */ l_sock = nfc_llcp_sock_get(local, ssap, LLCP_SAP_SDP); if (l_sock) { l_sock->ssap = LLCP_SDP_UNBOUND; nfc_llcp_sock_put(l_sock); } } mutex_unlock(&local->sdp_lock); return; } else if (ssap < LLCP_MAX_SAP) { local_ssap = ssap - LLCP_LOCAL_NUM_SAP; sdp = &local->local_sap; } else { return; } mutex_lock(&local->sdp_lock); clear_bit(local_ssap, sdp); mutex_unlock(&local->sdp_lock); } static u8 nfc_llcp_reserve_sdp_ssap(struct nfc_llcp_local *local) { u8 ssap; mutex_lock(&local->sdp_lock); ssap = find_first_zero_bit(&local->local_sdp, LLCP_SDP_NUM_SAP); if (ssap == LLCP_SDP_NUM_SAP) { mutex_unlock(&local->sdp_lock); return LLCP_SAP_MAX; } pr_debug("SDP ssap %d\n", LLCP_WKS_NUM_SAP + ssap); set_bit(ssap, &local->local_sdp); mutex_unlock(&local->sdp_lock); return LLCP_WKS_NUM_SAP + ssap; } static int nfc_llcp_build_gb(struct nfc_llcp_local *local) { u8 *gb_cur, version, version_length; u8 lto_length, wks_length, miux_length; const u8 *version_tlv = NULL, *lto_tlv = NULL, *wks_tlv = NULL, *miux_tlv = NULL; __be16 wks = cpu_to_be16(local->local_wks); u8 gb_len = 0; int ret = 0; version = LLCP_VERSION_11; version_tlv = nfc_llcp_build_tlv(LLCP_TLV_VERSION, &version, 1, &version_length); if (!version_tlv) { ret = -ENOMEM; goto out; } gb_len += version_length; lto_tlv = nfc_llcp_build_tlv(LLCP_TLV_LTO, &local->lto, 1, <o_length); if (!lto_tlv) { ret = -ENOMEM; goto out; } gb_len += lto_length; pr_debug("Local wks 0x%lx\n", local->local_wks); wks_tlv = nfc_llcp_build_tlv(LLCP_TLV_WKS, (u8 *)&wks, 2, &wks_length); if (!wks_tlv) { ret = -ENOMEM; goto out; } gb_len += wks_length; miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&local->miux, 0, &miux_length); if (!miux_tlv) { ret = -ENOMEM; goto out; } gb_len += miux_length; gb_len += ARRAY_SIZE(llcp_magic); if (gb_len > NFC_MAX_GT_LEN) { ret = -EINVAL; goto out; } gb_cur = local->gb; memcpy(gb_cur, llcp_magic, ARRAY_SIZE(llcp_magic)); gb_cur += ARRAY_SIZE(llcp_magic); memcpy(gb_cur, version_tlv, version_length); gb_cur += version_length; memcpy(gb_cur, lto_tlv, lto_length); gb_cur += lto_length; memcpy(gb_cur, wks_tlv, wks_length); gb_cur += wks_length; memcpy(gb_cur, miux_tlv, miux_length); gb_cur += miux_length; local->gb_len = gb_len; out: kfree(version_tlv); kfree(lto_tlv); kfree(wks_tlv); kfree(miux_tlv); return ret; } u8 *nfc_llcp_general_bytes(struct nfc_dev *dev, size_t *general_bytes_len) { struct nfc_llcp_local *local; local = nfc_llcp_find_local(dev); if (local == NULL) { *general_bytes_len = 0; return NULL; } nfc_llcp_build_gb(local); *general_bytes_len = local->gb_len; nfc_llcp_local_put(local); return local->gb; } int nfc_llcp_set_remote_gb(struct nfc_dev *dev, const u8 *gb, u8 gb_len) { struct nfc_llcp_local *local; int err; if (gb_len < 3 || gb_len > NFC_MAX_GT_LEN) return -EINVAL; local = nfc_llcp_find_local(dev); if (local == NULL) { pr_err("No LLCP device\n"); return -ENODEV; } memset(local->remote_gb, 0, NFC_MAX_GT_LEN); memcpy(local->remote_gb, gb, gb_len); local->remote_gb_len = gb_len; if (memcmp(local->remote_gb, llcp_magic, 3)) { pr_err("MAC does not support LLCP\n"); err = -EINVAL; goto out; } err = nfc_llcp_parse_gb_tlv(local, &local->remote_gb[3], local->remote_gb_len - 3); out: nfc_llcp_local_put(local); return err; } static u8 nfc_llcp_dsap(const struct sk_buff *pdu) { return (pdu->data[0] & 0xfc) >> 2; } static u8 nfc_llcp_ptype(const struct sk_buff *pdu) { return ((pdu->data[0] & 0x03) << 2) | ((pdu->data[1] & 0xc0) >> 6); } static u8 nfc_llcp_ssap(const struct sk_buff *pdu) { return pdu->data[1] & 0x3f; } static u8 nfc_llcp_ns(const struct sk_buff *pdu) { return pdu->data[2] >> 4; } static u8 nfc_llcp_nr(const struct sk_buff *pdu) { return pdu->data[2] & 0xf; } static void nfc_llcp_set_nrns(struct nfc_llcp_sock *sock, struct sk_buff *pdu) { pdu->data[2] = (sock->send_n << 4) | (sock->recv_n); sock->send_n = (sock->send_n + 1) % 16; sock->recv_ack_n = (sock->recv_n - 1) % 16; } void nfc_llcp_send_to_raw_sock(struct nfc_llcp_local *local, struct sk_buff *skb, u8 direction) { struct sk_buff *skb_copy = NULL, *nskb; struct sock *sk; u8 *data; read_lock(&local->raw_sockets.lock); sk_for_each(sk, &local->raw_sockets.head) { if (sk->sk_state != LLCP_BOUND) continue; if (skb_copy == NULL) { skb_copy = __pskb_copy_fclone(skb, NFC_RAW_HEADER_SIZE, GFP_ATOMIC, true); if (skb_copy == NULL) continue; data = skb_push(skb_copy, NFC_RAW_HEADER_SIZE); data[0] = local->dev ? local->dev->idx : 0xFF; data[1] = direction & 0x01; data[1] |= (RAW_PAYLOAD_LLCP << 1); } nskb = skb_clone(skb_copy, GFP_ATOMIC); if (!nskb) continue; if (sock_queue_rcv_skb(sk, nskb)) kfree_skb(nskb); } read_unlock(&local->raw_sockets.lock); kfree_skb(skb_copy); } static void nfc_llcp_tx_work(struct work_struct *work) { struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local, tx_work); struct sk_buff *skb; struct sock *sk; struct nfc_llcp_sock *llcp_sock; skb = skb_dequeue(&local->tx_queue); if (skb != NULL) { sk = skb->sk; llcp_sock = nfc_llcp_sock(sk); if (llcp_sock == NULL && nfc_llcp_ptype(skb) == LLCP_PDU_I) { kfree_skb(skb); nfc_llcp_send_symm(local->dev); } else if (llcp_sock && !llcp_sock->remote_ready) { skb_queue_head(&local->tx_queue, skb); nfc_llcp_send_symm(local->dev); } else { struct sk_buff *copy_skb = NULL; u8 ptype = nfc_llcp_ptype(skb); int ret; pr_debug("Sending pending skb\n"); print_hex_dump_debug("LLCP Tx: ", DUMP_PREFIX_OFFSET, 16, 1, skb->data, skb->len, true); if (ptype == LLCP_PDU_I) copy_skb = skb_copy(skb, GFP_ATOMIC); __net_timestamp(skb); nfc_llcp_send_to_raw_sock(local, skb, NFC_DIRECTION_TX); ret = nfc_data_exchange(local->dev, local->target_idx, skb, nfc_llcp_recv, local); if (ret) { kfree_skb(copy_skb); goto out; } if (ptype == LLCP_PDU_I && copy_skb) skb_queue_tail(&llcp_sock->tx_pending_queue, copy_skb); } } else { nfc_llcp_send_symm(local->dev); } out: mod_timer(&local->link_timer, jiffies + msecs_to_jiffies(2 * local->remote_lto)); } static struct nfc_llcp_sock *nfc_llcp_connecting_sock_get(struct nfc_llcp_local *local, u8 ssap) { struct sock *sk; struct nfc_llcp_sock *llcp_sock; read_lock(&local->connecting_sockets.lock); sk_for_each(sk, &local->connecting_sockets.head) { llcp_sock = nfc_llcp_sock(sk); if (llcp_sock->ssap == ssap) { sock_hold(&llcp_sock->sk); goto out; } } llcp_sock = NULL; out: read_unlock(&local->connecting_sockets.lock); return llcp_sock; } static struct nfc_llcp_sock *nfc_llcp_sock_get_sn(struct nfc_llcp_local *local, const u8 *sn, size_t sn_len) { return nfc_llcp_sock_from_sn(local, sn, sn_len, true); } static const u8 *nfc_llcp_connect_sn(const struct sk_buff *skb, size_t *sn_len) { u8 type, length; const u8 *tlv = &skb->data[2]; size_t tlv_array_len = skb->len - LLCP_HEADER_SIZE, offset = 0; while (offset < tlv_array_len) { type = tlv[0]; length = tlv[1]; pr_debug("type 0x%x length %d\n", type, length); if (type == LLCP_TLV_SN) { *sn_len = length; return &tlv[2]; } offset += length + 2; tlv += length + 2; } return NULL; } static void nfc_llcp_recv_ui(struct nfc_llcp_local *local, struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct nfc_llcp_ui_cb *ui_cb; u8 dsap, ssap; dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); ui_cb = nfc_llcp_ui_skb_cb(skb); ui_cb->dsap = dsap; ui_cb->ssap = ssap; pr_debug("%d %d\n", dsap, ssap); /* We're looking for a bound socket, not a client one */ llcp_sock = nfc_llcp_sock_get(local, dsap, LLCP_SAP_SDP); if (llcp_sock == NULL || llcp_sock->sk.sk_type != SOCK_DGRAM) return; /* There is no sequence with UI frames */ skb_pull(skb, LLCP_HEADER_SIZE); if (!sock_queue_rcv_skb(&llcp_sock->sk, skb)) { /* * UI frames will be freed from the socket layer, so we * need to keep them alive until someone receives them. */ skb_get(skb); } else { pr_err("Receive queue is full\n"); } nfc_llcp_sock_put(llcp_sock); } static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, const struct sk_buff *skb) { struct sock *new_sk, *parent; struct nfc_llcp_sock *sock, *new_sock; u8 dsap, ssap, reason; dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); pr_debug("%d %d\n", dsap, ssap); if (dsap != LLCP_SAP_SDP) { sock = nfc_llcp_sock_get(local, dsap, LLCP_SAP_SDP); if (sock == NULL || sock->sk.sk_state != LLCP_LISTEN) { reason = LLCP_DM_NOBOUND; goto fail; } } else { const u8 *sn; size_t sn_len; sn = nfc_llcp_connect_sn(skb, &sn_len); if (sn == NULL) { reason = LLCP_DM_NOBOUND; goto fail; } pr_debug("Service name length %zu\n", sn_len); sock = nfc_llcp_sock_get_sn(local, sn, sn_len); if (sock == NULL) { reason = LLCP_DM_NOBOUND; goto fail; } } lock_sock(&sock->sk); parent = &sock->sk; if (sk_acceptq_is_full(parent)) { reason = LLCP_DM_REJ; release_sock(&sock->sk); sock_put(&sock->sk); goto fail; } if (sock->ssap == LLCP_SDP_UNBOUND) { u8 ssap = nfc_llcp_reserve_sdp_ssap(local); pr_debug("First client, reserving %d\n", ssap); if (ssap == LLCP_SAP_MAX) { reason = LLCP_DM_REJ; release_sock(&sock->sk); sock_put(&sock->sk); goto fail; } sock->ssap = ssap; } new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC, 0); if (new_sk == NULL) { reason = LLCP_DM_REJ; release_sock(&sock->sk); sock_put(&sock->sk); goto fail; } new_sock = nfc_llcp_sock(new_sk); new_sock->local = nfc_llcp_local_get(local); if (!new_sock->local) { reason = LLCP_DM_REJ; sock_put(&new_sock->sk); release_sock(&sock->sk); sock_put(&sock->sk); goto fail; } new_sock->dev = local->dev; new_sock->rw = sock->rw; new_sock->miux = sock->miux; new_sock->nfc_protocol = sock->nfc_protocol; new_sock->dsap = ssap; new_sock->target_idx = local->target_idx; new_sock->parent = parent; new_sock->ssap = sock->ssap; if (sock->ssap < LLCP_LOCAL_NUM_SAP && sock->ssap >= LLCP_WKS_NUM_SAP) { atomic_t *client_count; pr_debug("reserved_ssap %d for %p\n", sock->ssap, new_sock); client_count = &local->local_sdp_cnt[sock->ssap - LLCP_WKS_NUM_SAP]; atomic_inc(client_count); new_sock->reserved_ssap = sock->ssap; } nfc_llcp_parse_connection_tlv(new_sock, &skb->data[LLCP_HEADER_SIZE], skb->len - LLCP_HEADER_SIZE); pr_debug("new sock %p sk %p\n", new_sock, &new_sock->sk); nfc_llcp_sock_link(&local->sockets, new_sk); nfc_llcp_accept_enqueue(&sock->sk, new_sk); nfc_get_device(local->dev->idx); new_sk->sk_state = LLCP_CONNECTED; /* Wake the listening processes */ parent->sk_data_ready(parent); /* Send CC */ nfc_llcp_send_cc(new_sock); release_sock(&sock->sk); sock_put(&sock->sk); return; fail: /* Send DM */ nfc_llcp_send_dm(local, dsap, ssap, reason); } int nfc_llcp_queue_i_frames(struct nfc_llcp_sock *sock) { int nr_frames = 0; struct nfc_llcp_local *local = sock->local; pr_debug("Remote ready %d tx queue len %d remote rw %d", sock->remote_ready, skb_queue_len(&sock->tx_pending_queue), sock->remote_rw); /* Try to queue some I frames for transmission */ while (sock->remote_ready && skb_queue_len(&sock->tx_pending_queue) < sock->remote_rw) { struct sk_buff *pdu; pdu = skb_dequeue(&sock->tx_queue); if (pdu == NULL) break; /* Update N(S)/N(R) */ nfc_llcp_set_nrns(sock, pdu); skb_queue_tail(&local->tx_queue, pdu); nr_frames++; } return nr_frames; } static void nfc_llcp_recv_hdlc(struct nfc_llcp_local *local, struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; u8 dsap, ssap, ptype, ns, nr; ptype = nfc_llcp_ptype(skb); dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); ns = nfc_llcp_ns(skb); nr = nfc_llcp_nr(skb); pr_debug("%d %d R %d S %d\n", dsap, ssap, nr, ns); llcp_sock = nfc_llcp_sock_get(local, dsap, ssap); if (llcp_sock == NULL) { nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_NOCONN); return; } sk = &llcp_sock->sk; lock_sock(sk); if (sk->sk_state == LLCP_CLOSED) { release_sock(sk); nfc_llcp_sock_put(llcp_sock); } /* Pass the payload upstream */ if (ptype == LLCP_PDU_I) { pr_debug("I frame, queueing on %p\n", &llcp_sock->sk); if (ns == llcp_sock->recv_n) llcp_sock->recv_n = (llcp_sock->recv_n + 1) % 16; else pr_err("Received out of sequence I PDU\n"); skb_pull(skb, LLCP_HEADER_SIZE + LLCP_SEQUENCE_SIZE); if (!sock_queue_rcv_skb(&llcp_sock->sk, skb)) { /* * I frames will be freed from the socket layer, so we * need to keep them alive until someone receives them. */ skb_get(skb); } else { pr_err("Receive queue is full\n"); } } /* Remove skbs from the pending queue */ if (llcp_sock->send_ack_n != nr) { struct sk_buff *s, *tmp; u8 n; llcp_sock->send_ack_n = nr; /* Remove and free all skbs until ns == nr */ skb_queue_walk_safe(&llcp_sock->tx_pending_queue, s, tmp) { n = nfc_llcp_ns(s); skb_unlink(s, &llcp_sock->tx_pending_queue); kfree_skb(s); if (n == nr) break; } /* Re-queue the remaining skbs for transmission */ skb_queue_reverse_walk_safe(&llcp_sock->tx_pending_queue, s, tmp) { skb_unlink(s, &llcp_sock->tx_pending_queue); skb_queue_head(&local->tx_queue, s); } } if (ptype == LLCP_PDU_RR) llcp_sock->remote_ready = true; else if (ptype == LLCP_PDU_RNR) llcp_sock->remote_ready = false; if (nfc_llcp_queue_i_frames(llcp_sock) == 0 && ptype == LLCP_PDU_I) nfc_llcp_send_rr(llcp_sock); release_sock(sk); nfc_llcp_sock_put(llcp_sock); } static void nfc_llcp_recv_disc(struct nfc_llcp_local *local, const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; u8 dsap, ssap; dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); if ((dsap == 0) && (ssap == 0)) { pr_debug("Connection termination"); nfc_dep_link_down(local->dev); return; } llcp_sock = nfc_llcp_sock_get(local, dsap, ssap); if (llcp_sock == NULL) { nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_NOCONN); return; } sk = &llcp_sock->sk; lock_sock(sk); nfc_llcp_socket_purge(llcp_sock); if (sk->sk_state == LLCP_CLOSED) { release_sock(sk); nfc_llcp_sock_put(llcp_sock); } if (sk->sk_state == LLCP_CONNECTED) { nfc_put_device(local->dev); sk->sk_state = LLCP_CLOSED; sk->sk_state_change(sk); } nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_DISC); release_sock(sk); nfc_llcp_sock_put(llcp_sock); } static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; u8 dsap, ssap; dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); llcp_sock = nfc_llcp_connecting_sock_get(local, dsap); if (llcp_sock == NULL) { pr_err("Invalid CC\n"); nfc_llcp_send_dm(local, dsap, ssap, LLCP_DM_NOCONN); return; } sk = &llcp_sock->sk; /* Unlink from connecting and link to the client array */ nfc_llcp_sock_unlink(&local->connecting_sockets, sk); nfc_llcp_sock_link(&local->sockets, sk); llcp_sock->dsap = ssap; nfc_llcp_parse_connection_tlv(llcp_sock, &skb->data[LLCP_HEADER_SIZE], skb->len - LLCP_HEADER_SIZE); sk->sk_state = LLCP_CONNECTED; sk->sk_state_change(sk); nfc_llcp_sock_put(llcp_sock); } static void nfc_llcp_recv_dm(struct nfc_llcp_local *local, const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; u8 dsap, ssap, reason; dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); reason = skb->data[2]; pr_debug("%d %d reason %d\n", ssap, dsap, reason); switch (reason) { case LLCP_DM_NOBOUND: case LLCP_DM_REJ: llcp_sock = nfc_llcp_connecting_sock_get(local, dsap); break; default: llcp_sock = nfc_llcp_sock_get(local, dsap, ssap); break; } if (llcp_sock == NULL) { pr_debug("Already closed\n"); return; } sk = &llcp_sock->sk; sk->sk_err = ENXIO; sk->sk_state = LLCP_CLOSED; sk->sk_state_change(sk); nfc_llcp_sock_put(llcp_sock); } static void nfc_llcp_recv_snl(struct nfc_llcp_local *local, const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; u8 dsap, ssap, type, length, tid, sap; const u8 *tlv; u16 tlv_len, offset; const char *service_name; size_t service_name_len; struct nfc_llcp_sdp_tlv *sdp; HLIST_HEAD(llc_sdres_list); size_t sdres_tlvs_len; HLIST_HEAD(nl_sdres_list); dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); pr_debug("%d %d\n", dsap, ssap); if (dsap != LLCP_SAP_SDP || ssap != LLCP_SAP_SDP) { pr_err("Wrong SNL SAP\n"); return; } tlv = &skb->data[LLCP_HEADER_SIZE]; tlv_len = skb->len - LLCP_HEADER_SIZE; offset = 0; sdres_tlvs_len = 0; while (offset < tlv_len) { type = tlv[0]; length = tlv[1]; switch (type) { case LLCP_TLV_SDREQ: tid = tlv[2]; service_name = (char *) &tlv[3]; service_name_len = length - 1; pr_debug("Looking for %.16s\n", service_name); if (service_name_len == strlen("urn:nfc:sn:sdp") && !strncmp(service_name, "urn:nfc:sn:sdp", service_name_len)) { sap = 1; goto add_snl; } llcp_sock = nfc_llcp_sock_from_sn(local, service_name, service_name_len, true); if (!llcp_sock) { sap = 0; goto add_snl; } /* * We found a socket but its ssap has not been reserved * yet. We need to assign it for good and send a reply. * The ssap will be freed when the socket is closed. */ if (llcp_sock->ssap == LLCP_SDP_UNBOUND) { atomic_t *client_count; sap = nfc_llcp_reserve_sdp_ssap(local); pr_debug("Reserving %d\n", sap); if (sap == LLCP_SAP_MAX) { sap = 0; nfc_llcp_sock_put(llcp_sock); goto add_snl; } client_count = &local->local_sdp_cnt[sap - LLCP_WKS_NUM_SAP]; atomic_inc(client_count); llcp_sock->ssap = sap; llcp_sock->reserved_ssap = sap; } else { sap = llcp_sock->ssap; } pr_debug("%p %d\n", llcp_sock, sap); nfc_llcp_sock_put(llcp_sock); add_snl: sdp = nfc_llcp_build_sdres_tlv(tid, sap); if (sdp == NULL) goto exit; sdres_tlvs_len += sdp->tlv_len; hlist_add_head(&sdp->node, &llc_sdres_list); break; case LLCP_TLV_SDRES: mutex_lock(&local->sdreq_lock); pr_debug("LLCP_TLV_SDRES: searching tid %d\n", tlv[2]); hlist_for_each_entry(sdp, &local->pending_sdreqs, node) { if (sdp->tid != tlv[2]) continue; sdp->sap = tlv[3]; pr_debug("Found: uri=%s, sap=%d\n", sdp->uri, sdp->sap); hlist_del(&sdp->node); hlist_add_head(&sdp->node, &nl_sdres_list); break; } mutex_unlock(&local->sdreq_lock); break; default: pr_err("Invalid SNL tlv value 0x%x\n", type); break; } offset += length + 2; tlv += length + 2; } exit: if (!hlist_empty(&nl_sdres_list)) nfc_genl_llc_send_sdres(local->dev, &nl_sdres_list); if (!hlist_empty(&llc_sdres_list)) nfc_llcp_send_snl_sdres(local, &llc_sdres_list, sdres_tlvs_len); } static void nfc_llcp_recv_agf(struct nfc_llcp_local *local, struct sk_buff *skb) { u8 ptype; u16 pdu_len; struct sk_buff *new_skb; if (skb->len <= LLCP_HEADER_SIZE) { pr_err("Malformed AGF PDU\n"); return; } skb_pull(skb, LLCP_HEADER_SIZE); while (skb->len > LLCP_AGF_PDU_HEADER_SIZE) { pdu_len = skb->data[0] << 8 | skb->data[1]; skb_pull(skb, LLCP_AGF_PDU_HEADER_SIZE); if (pdu_len < LLCP_HEADER_SIZE || pdu_len > skb->len) { pr_err("Malformed AGF PDU\n"); return; } ptype = nfc_llcp_ptype(skb); if (ptype == LLCP_PDU_SYMM || ptype == LLCP_PDU_AGF) goto next; new_skb = nfc_alloc_recv_skb(pdu_len, GFP_KERNEL); if (new_skb == NULL) { pr_err("Could not allocate PDU\n"); return; } skb_put_data(new_skb, skb->data, pdu_len); nfc_llcp_rx_skb(local, new_skb); kfree_skb(new_skb); next: skb_pull(skb, pdu_len); } } static void nfc_llcp_rx_skb(struct nfc_llcp_local *local, struct sk_buff *skb) { u8 dsap, ssap, ptype; ptype = nfc_llcp_ptype(skb); dsap = nfc_llcp_dsap(skb); ssap = nfc_llcp_ssap(skb); pr_debug("ptype 0x%x dsap 0x%x ssap 0x%x\n", ptype, dsap, ssap); if (ptype != LLCP_PDU_SYMM) print_hex_dump_debug("LLCP Rx: ", DUMP_PREFIX_OFFSET, 16, 1, skb->data, skb->len, true); switch (ptype) { case LLCP_PDU_SYMM: pr_debug("SYMM\n"); break; case LLCP_PDU_UI: pr_debug("UI\n"); nfc_llcp_recv_ui(local, skb); break; case LLCP_PDU_CONNECT: pr_debug("CONNECT\n"); nfc_llcp_recv_connect(local, skb); break; case LLCP_PDU_DISC: pr_debug("DISC\n"); nfc_llcp_recv_disc(local, skb); break; case LLCP_PDU_CC: pr_debug("CC\n"); nfc_llcp_recv_cc(local, skb); break; case LLCP_PDU_DM: pr_debug("DM\n"); nfc_llcp_recv_dm(local, skb); break; case LLCP_PDU_SNL: pr_debug("SNL\n"); nfc_llcp_recv_snl(local, skb); break; case LLCP_PDU_I: case LLCP_PDU_RR: case LLCP_PDU_RNR: pr_debug("I frame\n"); nfc_llcp_recv_hdlc(local, skb); break; case LLCP_PDU_AGF: pr_debug("AGF frame\n"); nfc_llcp_recv_agf(local, skb); break; } } static void nfc_llcp_rx_work(struct work_struct *work) { struct nfc_llcp_local *local = container_of(work, struct nfc_llcp_local, rx_work); struct sk_buff *skb; skb = local->rx_pending; if (skb == NULL) { pr_debug("No pending SKB\n"); return; } __net_timestamp(skb); nfc_llcp_send_to_raw_sock(local, skb, NFC_DIRECTION_RX); nfc_llcp_rx_skb(local, skb); schedule_work(&local->tx_work); kfree_skb(local->rx_pending); local->rx_pending = NULL; } static void __nfc_llcp_recv(struct nfc_llcp_local *local, struct sk_buff *skb) { local->rx_pending = skb; del_timer(&local->link_timer); schedule_work(&local->rx_work); } void nfc_llcp_recv(void *data, struct sk_buff *skb, int err) { struct nfc_llcp_local *local = (struct nfc_llcp_local *) data; if (err < 0) { pr_err("LLCP PDU receive err %d\n", err); return; } __nfc_llcp_recv(local, skb); } int nfc_llcp_data_received(struct nfc_dev *dev, struct sk_buff *skb) { struct nfc_llcp_local *local; local = nfc_llcp_find_local(dev); if (local == NULL) { kfree_skb(skb); return -ENODEV; } __nfc_llcp_recv(local, skb); nfc_llcp_local_put(local); return 0; } void nfc_llcp_mac_is_down(struct nfc_dev *dev) { struct nfc_llcp_local *local; local = nfc_llcp_find_local(dev); if (local == NULL) return; local->remote_miu = LLCP_DEFAULT_MIU; local->remote_lto = LLCP_DEFAULT_LTO; /* Close and purge all existing sockets */ nfc_llcp_socket_release(local, true, 0); nfc_llcp_local_put(local); } void nfc_llcp_mac_is_up(struct nfc_dev *dev, u32 target_idx, u8 comm_mode, u8 rf_mode) { struct nfc_llcp_local *local; pr_debug("rf mode %d\n", rf_mode); local = nfc_llcp_find_local(dev); if (local == NULL) return; local->target_idx = target_idx; local->comm_mode = comm_mode; local->rf_mode = rf_mode; if (rf_mode == NFC_RF_INITIATOR) { pr_debug("Queueing Tx work\n"); schedule_work(&local->tx_work); } else { mod_timer(&local->link_timer, jiffies + msecs_to_jiffies(local->remote_lto)); } nfc_llcp_local_put(local); } int nfc_llcp_register_device(struct nfc_dev *ndev) { struct nfc_llcp_local *local; local = kzalloc(sizeof(struct nfc_llcp_local), GFP_KERNEL); if (local == NULL) return -ENOMEM; /* As we are going to initialize local's refcount, we need to get the * nfc_dev to avoid UAF, otherwise there is no point in continuing. * See nfc_llcp_local_get(). */ local->dev = nfc_get_device(ndev->idx); if (!local->dev) { kfree(local); return -ENODEV; } INIT_LIST_HEAD(&local->list); kref_init(&local->ref); mutex_init(&local->sdp_lock); timer_setup(&local->link_timer, nfc_llcp_symm_timer, 0); skb_queue_head_init(&local->tx_queue); INIT_WORK(&local->tx_work, nfc_llcp_tx_work); local->rx_pending = NULL; INIT_WORK(&local->rx_work, nfc_llcp_rx_work); INIT_WORK(&local->timeout_work, nfc_llcp_timeout_work); rwlock_init(&local->sockets.lock); rwlock_init(&local->connecting_sockets.lock); rwlock_init(&local->raw_sockets.lock); local->lto = 150; /* 1500 ms */ local->rw = LLCP_MAX_RW; local->miux = cpu_to_be16(LLCP_MAX_MIUX); local->local_wks = 0x1; /* LLC Link Management */ nfc_llcp_build_gb(local); local->remote_miu = LLCP_DEFAULT_MIU; local->remote_lto = LLCP_DEFAULT_LTO; mutex_init(&local->sdreq_lock); INIT_HLIST_HEAD(&local->pending_sdreqs); timer_setup(&local->sdreq_timer, nfc_llcp_sdreq_timer, 0); INIT_WORK(&local->sdreq_timeout_work, nfc_llcp_sdreq_timeout_work); spin_lock(&llcp_devices_lock); list_add(&local->list, &llcp_devices); spin_unlock(&llcp_devices_lock); return 0; } void nfc_llcp_unregister_device(struct nfc_dev *dev) { struct nfc_llcp_local *local = nfc_llcp_remove_local(dev); if (local == NULL) { pr_debug("No such device\n"); return; } local_cleanup(local); nfc_llcp_local_put(local); } int __init nfc_llcp_init(void) { return nfc_llcp_sock_init(); } void nfc_llcp_exit(void) { nfc_llcp_sock_exit(); } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM cma #if !defined(_TRACE_CMA_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_CMA_H #include <linux/types.h> #include <linux/tracepoint.h> TRACE_EVENT(cma_release, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned long count), TP_ARGS(name, pfn, page, count), TP_STRUCT__entry( __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) __field(unsigned long, count) ), TP_fast_assign( __assign_str(name); __entry->pfn = pfn; __entry->page = page; __entry->count = count; ), TP_printk("name=%s pfn=0x%lx page=%p count=%lu", __get_str(name), __entry->pfn, __entry->page, __entry->count) ); TRACE_EVENT(cma_alloc_start, TP_PROTO(const char *name, unsigned long count, unsigned int align), TP_ARGS(name, count, align), TP_STRUCT__entry( __string(name, name) __field(unsigned long, count) __field(unsigned int, align) ), TP_fast_assign( __assign_str(name); __entry->count = count; __entry->align = align; ), TP_printk("name=%s count=%lu align=%u", __get_str(name), __entry->count, __entry->align) ); TRACE_EVENT(cma_alloc_finish, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned long count, unsigned int align, int errorno), TP_ARGS(name, pfn, page, count, align, errorno), TP_STRUCT__entry( __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) __field(unsigned long, count) __field(unsigned int, align) __field(int, errorno) ), TP_fast_assign( __assign_str(name); __entry->pfn = pfn; __entry->page = page; __entry->count = count; __entry->align = align; __entry->errorno = errorno; ), TP_printk("name=%s pfn=0x%lx page=%p count=%lu align=%u errorno=%d", __get_str(name), __entry->pfn, __entry->page, __entry->count, __entry->align, __entry->errorno) ); TRACE_EVENT(cma_alloc_busy_retry, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned long count, unsigned int align), TP_ARGS(name, pfn, page, count, align), TP_STRUCT__entry( __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) __field(unsigned long, count) __field(unsigned int, align) ), TP_fast_assign( __assign_str(name); __entry->pfn = pfn; __entry->page = page; __entry->count = count; __entry->align = align; ), TP_printk("name=%s pfn=0x%lx page=%p count=%lu align=%u", __get_str(name), __entry->pfn, __entry->page, __entry->count, __entry->align) ); #endif /* _TRACE_CMA_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
816 471 471 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM csd #if !defined(_TRACE_CSD_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_CSD_H #include <linux/tracepoint.h> TRACE_EVENT(csd_queue_cpu, TP_PROTO(const unsigned int cpu, unsigned long callsite, smp_call_func_t func, call_single_data_t *csd), TP_ARGS(cpu, callsite, func, csd), TP_STRUCT__entry( __field(unsigned int, cpu) __field(void *, callsite) __field(void *, func) __field(void *, csd) ), TP_fast_assign( __entry->cpu = cpu; __entry->callsite = (void *)callsite; __entry->func = func; __entry->csd = csd; ), TP_printk("cpu=%u callsite=%pS func=%ps csd=%p", __entry->cpu, __entry->callsite, __entry->func, __entry->csd) ); /* * Tracepoints for a function which is called as an effect of smp_call_function.* */ DECLARE_EVENT_CLASS(csd_function, TP_PROTO(smp_call_func_t func, call_single_data_t *csd), TP_ARGS(func, csd), TP_STRUCT__entry( __field(void *, func) __field(void *, csd) ), TP_fast_assign( __entry->func = func; __entry->csd = csd; ), TP_printk("func=%ps, csd=%p", __entry->func, __entry->csd) ); DEFINE_EVENT(csd_function, csd_function_entry, TP_PROTO(smp_call_func_t func, call_single_data_t *csd), TP_ARGS(func, csd) ); DEFINE_EVENT(csd_function, csd_function_exit, TP_PROTO(smp_call_func_t func, call_single_data_t *csd), TP_ARGS(func, csd) ); #endif /* _TRACE_CSD_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS: Never Queue scheduling module * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * Changes: */ /* * The NQ algorithm adopts a two-speed model. When there is an idle server * available, the job will be sent to the idle server, instead of waiting * for a fast one. When there is no idle server available, the job will be * sent to the server that minimize its expected delay (The Shortest * Expected Delay scheduling algorithm). * * See the following paper for more information: * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, * pages 986-994, 1988. * * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me. * * The difference between NQ and SED is that NQ can improve overall * system utilization. * */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <net/ip_vs.h> static inline int ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) { /* * We only use the active connection number in the cost * calculation here. */ return atomic_read(&dest->activeconns) + 1; } /* * Weighted Least Connection scheduling */ static struct ip_vs_dest * ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph) { struct ip_vs_dest *dest, *least = NULL; int loh = 0, doh; IP_VS_DBG(6, "%s(): Scheduling...\n", __func__); /* * We calculate the load of each dest server as follows: * (server expected overhead) / dest->weight * * Remember -- no floats in kernel mode!!! * The comparison of h1*w2 > h2*w1 is equivalent to that of * h1/w1 > h2/w2 * if every weight is larger than zero. * * The server with weight=0 is quiesced and will not receive any * new connections. */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD || !atomic_read(&dest->weight)) continue; doh = ip_vs_nq_dest_overhead(dest); /* return the server directly if it is idle */ if (atomic_read(&dest->activeconns) == 0) { least = dest; loh = doh; goto out; } if (!least || ((__s64)loh * atomic_read(&dest->weight) > (__s64)doh * atomic_read(&least->weight))) { least = dest; loh = doh; } } if (!least) { ip_vs_scheduler_err(svc, "no destination available"); return NULL; } out: IP_VS_DBG_BUF(6, "NQ: server %s:%u " "activeconns %d refcnt %d weight %d overhead %d\n", IP_VS_DBG_ADDR(least->af, &least->addr), ntohs(least->port), atomic_read(&least->activeconns), refcount_read(&least->refcnt), atomic_read(&least->weight), loh); return least; } static struct ip_vs_scheduler ip_vs_nq_scheduler = { .name = "nq", .refcnt = ATOMIC_INIT(0), .module = THIS_MODULE, .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), .schedule = ip_vs_nq_schedule, }; static int __init ip_vs_nq_init(void) { return register_ip_vs_scheduler(&ip_vs_nq_scheduler); } static void __exit ip_vs_nq_cleanup(void) { unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); synchronize_rcu(); } module_init(ip_vs_nq_init); module_exit(ip_vs_nq_cleanup); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ipvs never queue scheduler"); |
86 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corporation, 2021 * * Author: Mike Rapoport <rppt@linux.ibm.com> */ #include <linux/mm.h> #include <linux/fs.h> #include <linux/swap.h> #include <linux/mount.h> #include <linux/memfd.h> #include <linux/bitops.h> #include <linux/printk.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/pseudo_fs.h> #include <linux/secretmem.h> #include <linux/set_memory.h> #include <linux/sched/signal.h> #include <uapi/linux/magic.h> #include <asm/tlbflush.h> #include "internal.h" #undef pr_fmt #define pr_fmt(fmt) "secretmem: " fmt /* * Define mode and flag masks to allow validation of the system call * parameters. */ #define SECRETMEM_MODE_MASK (0x0) #define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK static bool secretmem_enable __ro_after_init = 1; module_param_named(enable, secretmem_enable, bool, 0400); MODULE_PARM_DESC(secretmem_enable, "Enable secretmem and memfd_secret(2) system call"); static atomic_t secretmem_users; bool secretmem_active(void) { return !!atomic_read(&secretmem_users); } static vm_fault_t secretmem_fault(struct vm_fault *vmf) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct inode *inode = file_inode(vmf->vma->vm_file); pgoff_t offset = vmf->pgoff; gfp_t gfp = vmf->gfp_mask; unsigned long addr; struct page *page; struct folio *folio; vm_fault_t ret; int err; if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return vmf_error(-EINVAL); filemap_invalidate_lock_shared(mapping); retry: page = find_lock_page(mapping, offset); if (!page) { folio = folio_alloc(gfp | __GFP_ZERO, 0); if (!folio) { ret = VM_FAULT_OOM; goto out; } page = &folio->page; err = set_direct_map_invalid_noflush(page); if (err) { folio_put(folio); ret = vmf_error(err); goto out; } __folio_mark_uptodate(folio); err = filemap_add_folio(mapping, folio, offset, gfp); if (unlikely(err)) { folio_put(folio); /* * If a split of large page was required, it * already happened when we marked the page invalid * which guarantees that this call won't fail */ set_direct_map_default_noflush(page); if (err == -EEXIST) goto retry; ret = vmf_error(err); goto out; } addr = (unsigned long)page_address(page); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); } vmf->page = page; ret = VM_FAULT_LOCKED; out: filemap_invalidate_unlock_shared(mapping); return ret; } static const struct vm_operations_struct secretmem_vm_ops = { .fault = secretmem_fault, }; static int secretmem_release(struct inode *inode, struct file *file) { atomic_dec(&secretmem_users); return 0; } static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) { unsigned long len = vma->vm_end - vma->vm_start; if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) return -EAGAIN; vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP); vma->vm_ops = &secretmem_vm_ops; return 0; } bool vma_is_secretmem(struct vm_area_struct *vma) { return vma->vm_ops == &secretmem_vm_ops; } static const struct file_operations secretmem_fops = { .release = secretmem_release, .mmap = secretmem_mmap, }; static int secretmem_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { return -EBUSY; } static void secretmem_free_folio(struct folio *folio) { set_direct_map_default_noflush(&folio->page); folio_zero_segment(folio, 0, folio_size(folio)); } const struct address_space_operations secretmem_aops = { .dirty_folio = noop_dirty_folio, .free_folio = secretmem_free_folio, .migrate_folio = secretmem_migrate_folio, }; static int secretmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct address_space *mapping = inode->i_mapping; unsigned int ia_valid = iattr->ia_valid; int ret; filemap_invalidate_lock(mapping); if ((ia_valid & ATTR_SIZE) && inode->i_size) ret = -EINVAL; else ret = simple_setattr(idmap, dentry, iattr); filemap_invalidate_unlock(mapping); return ret; } static const struct inode_operations secretmem_iops = { .setattr = secretmem_setattr, }; static struct vfsmount *secretmem_mnt; static struct file *secretmem_file_create(unsigned long flags) { struct file *file; struct inode *inode; const char *anon_name = "[secretmem]"; const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name)); int err; inode = alloc_anon_inode(secretmem_mnt->mnt_sb); if (IS_ERR(inode)) return ERR_CAST(inode); err = security_inode_init_security_anon(inode, &qname, NULL); if (err) { file = ERR_PTR(err); goto err_free_inode; } file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", O_RDWR, &secretmem_fops); if (IS_ERR(file)) goto err_free_inode; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); inode->i_op = &secretmem_iops; inode->i_mapping->a_ops = &secretmem_aops; /* pretend we are a normal file with zero size */ inode->i_mode |= S_IFREG; inode->i_size = 0; return file; err_free_inode: iput(inode); return file; } SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) { struct file *file; int fd, err; /* make sure local flags do not confict with global fcntl.h */ BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); if (!secretmem_enable || !can_set_direct_map()) return -ENOSYS; if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) return -EINVAL; if (atomic_read(&secretmem_users) < 0) return -ENFILE; fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; file = secretmem_file_create(flags); if (IS_ERR(file)) { err = PTR_ERR(file); goto err_put_fd; } file->f_flags |= O_LARGEFILE; atomic_inc(&secretmem_users); fd_install(fd, file); return fd; err_put_fd: put_unused_fd(fd); return err; } static int secretmem_init_fs_context(struct fs_context *fc) { return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM; } static struct file_system_type secretmem_fs = { .name = "secretmem", .init_fs_context = secretmem_init_fs_context, .kill_sb = kill_anon_super, }; static int __init secretmem_init(void) { if (!secretmem_enable || !can_set_direct_map()) return 0; secretmem_mnt = kern_mount(&secretmem_fs); if (IS_ERR(secretmem_mnt)) return PTR_ERR(secretmem_mnt); /* prevent secretmem mappings from ever getting PROT_EXEC */ secretmem_mnt->mnt_flags |= MNT_NOEXEC; return 0; } fs_initcall(secretmem_init); |
39 39 39 39 37 4 2 4 1 3 30 27 30 5 30 26 34 29 26 13 13 13 13 4 34 34 34 34 39 39 13 13 47 46 34 39 39 23 37 3 2 1 3 1 2 15 21 6 6 20 20 2 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Spanning tree protocol; generic parts * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/kernel.h> #include <linux/rculist.h> #include <net/switchdev.h> #include "br_private.h" #include "br_private_stp.h" /* since time values in bpdu are in jiffies and then scaled (1/256) * before sending, make sure that is at least one STP tick. */ #define MESSAGE_AGE_INCR ((HZ / 256) + 1) static const char *const br_port_state_names[] = { [BR_STATE_DISABLED] = "disabled", [BR_STATE_LISTENING] = "listening", [BR_STATE_LEARNING] = "learning", [BR_STATE_FORWARDING] = "forwarding", [BR_STATE_BLOCKING] = "blocking", }; void br_set_state(struct net_bridge_port *p, unsigned int state) { struct switchdev_attr attr = { .orig_dev = p->dev, .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE, .flags = SWITCHDEV_F_DEFER, .u.stp_state = state, }; int err; /* Don't change the state of the ports if they are driven by a different * protocol. */ if (p->flags & BR_MRP_AWARE) return; p->state = state; if (br_opt_get(p->br, BROPT_MST_ENABLED)) { err = br_mst_set_state(p, 0, state, NULL); if (err) br_warn(p->br, "error setting MST state on port %u(%s)\n", p->port_no, netdev_name(p->dev)); } err = switchdev_port_attr_set(p->dev, &attr, NULL); if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); else br_info(p->br, "port %u(%s) entered %s state\n", (unsigned int) p->port_no, p->dev->name, br_port_state_names[p->state]); if (p->br->stp_enabled == BR_KERNEL_STP) { switch (p->state) { case BR_STATE_BLOCKING: p->stp_xstats.transition_blk++; break; case BR_STATE_FORWARDING: p->stp_xstats.transition_fwd++; break; } } } u8 br_port_get_stp_state(const struct net_device *dev) { struct net_bridge_port *p; ASSERT_RTNL(); p = br_port_get_rtnl(dev); if (!p) return BR_STATE_DISABLED; return p->state; } EXPORT_SYMBOL_GPL(br_port_get_stp_state); /* called under bridge lock */ struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no) { struct net_bridge_port *p; list_for_each_entry_rcu(p, &br->port_list, list, lockdep_is_held(&br->lock)) { if (p->port_no == port_no) return p; } return NULL; } /* called under bridge lock */ static int br_should_become_root_port(const struct net_bridge_port *p, u16 root_port) { struct net_bridge *br; struct net_bridge_port *rp; int t; br = p->br; if (p->state == BR_STATE_DISABLED || br_is_designated_port(p)) return 0; if (memcmp(&br->bridge_id, &p->designated_root, 8) <= 0) return 0; if (!root_port) return 1; rp = br_get_port(br, root_port); t = memcmp(&p->designated_root, &rp->designated_root, 8); if (t < 0) return 1; else if (t > 0) return 0; if (p->designated_cost + p->path_cost < rp->designated_cost + rp->path_cost) return 1; else if (p->designated_cost + p->path_cost > rp->designated_cost + rp->path_cost) return 0; t = memcmp(&p->designated_bridge, &rp->designated_bridge, 8); if (t < 0) return 1; else if (t > 0) return 0; if (p->designated_port < rp->designated_port) return 1; else if (p->designated_port > rp->designated_port) return 0; if (p->port_id < rp->port_id) return 1; return 0; } static void br_root_port_block(const struct net_bridge *br, struct net_bridge_port *p) { br_notice(br, "port %u(%s) tried to become root port (blocked)", (unsigned int) p->port_no, p->dev->name); br_set_state(p, BR_STATE_LISTENING); br_ifinfo_notify(RTM_NEWLINK, NULL, p); if (br->forward_delay > 0) mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay); } /* called under bridge lock */ static void br_root_selection(struct net_bridge *br) { struct net_bridge_port *p; u16 root_port = 0; list_for_each_entry(p, &br->port_list, list) { if (!br_should_become_root_port(p, root_port)) continue; if (p->flags & BR_ROOT_BLOCK) br_root_port_block(br, p); else root_port = p->port_no; } br->root_port = root_port; if (!root_port) { br->designated_root = br->bridge_id; br->root_path_cost = 0; } else { p = br_get_port(br, root_port); br->designated_root = p->designated_root; br->root_path_cost = p->designated_cost + p->path_cost; } } /* called under bridge lock */ void br_become_root_bridge(struct net_bridge *br) { br->max_age = br->bridge_max_age; br->hello_time = br->bridge_hello_time; br->forward_delay = br->bridge_forward_delay; br_topology_change_detection(br); del_timer(&br->tcn_timer); if (br->dev->flags & IFF_UP) { br_config_bpdu_generation(br); mod_timer(&br->hello_timer, jiffies + br->hello_time); } } /* called under bridge lock */ void br_transmit_config(struct net_bridge_port *p) { struct br_config_bpdu bpdu; struct net_bridge *br; if (timer_pending(&p->hold_timer)) { p->config_pending = 1; return; } br = p->br; bpdu.topology_change = br->topology_change; bpdu.topology_change_ack = p->topology_change_ack; bpdu.root = br->designated_root; bpdu.root_path_cost = br->root_path_cost; bpdu.bridge_id = br->bridge_id; bpdu.port_id = p->port_id; if (br_is_root_bridge(br)) bpdu.message_age = 0; else { struct net_bridge_port *root = br_get_port(br, br->root_port); bpdu.message_age = (jiffies - root->designated_age) + MESSAGE_AGE_INCR; } bpdu.max_age = br->max_age; bpdu.hello_time = br->hello_time; bpdu.forward_delay = br->forward_delay; if (bpdu.message_age < br->max_age) { br_send_config_bpdu(p, &bpdu); p->topology_change_ack = 0; p->config_pending = 0; if (p->br->stp_enabled == BR_KERNEL_STP) mod_timer(&p->hold_timer, round_jiffies(jiffies + BR_HOLD_TIME)); } } /* called under bridge lock */ static void br_record_config_information(struct net_bridge_port *p, const struct br_config_bpdu *bpdu) { p->designated_root = bpdu->root; p->designated_cost = bpdu->root_path_cost; p->designated_bridge = bpdu->bridge_id; p->designated_port = bpdu->port_id; p->designated_age = jiffies - bpdu->message_age; mod_timer(&p->message_age_timer, jiffies + (bpdu->max_age - bpdu->message_age)); } /* called under bridge lock */ static void br_record_config_timeout_values(struct net_bridge *br, const struct br_config_bpdu *bpdu) { br->max_age = bpdu->max_age; br->hello_time = bpdu->hello_time; br->forward_delay = bpdu->forward_delay; __br_set_topology_change(br, bpdu->topology_change); } /* called under bridge lock */ void br_transmit_tcn(struct net_bridge *br) { struct net_bridge_port *p; p = br_get_port(br, br->root_port); if (p) br_send_tcn_bpdu(p); else br_notice(br, "root port %u not found for topology notice\n", br->root_port); } /* called under bridge lock */ static int br_should_become_designated_port(const struct net_bridge_port *p) { struct net_bridge *br; int t; br = p->br; if (br_is_designated_port(p)) return 1; if (memcmp(&p->designated_root, &br->designated_root, 8)) return 1; if (br->root_path_cost < p->designated_cost) return 1; else if (br->root_path_cost > p->designated_cost) return 0; t = memcmp(&br->bridge_id, &p->designated_bridge, 8); if (t < 0) return 1; else if (t > 0) return 0; if (p->port_id < p->designated_port) return 1; return 0; } /* called under bridge lock */ static void br_designated_port_selection(struct net_bridge *br) { struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) { if (p->state != BR_STATE_DISABLED && br_should_become_designated_port(p)) br_become_designated_port(p); } } /* called under bridge lock */ static int br_supersedes_port_info(const struct net_bridge_port *p, const struct br_config_bpdu *bpdu) { int t; t = memcmp(&bpdu->root, &p->designated_root, 8); if (t < 0) return 1; else if (t > 0) return 0; if (bpdu->root_path_cost < p->designated_cost) return 1; else if (bpdu->root_path_cost > p->designated_cost) return 0; t = memcmp(&bpdu->bridge_id, &p->designated_bridge, 8); if (t < 0) return 1; else if (t > 0) return 0; if (memcmp(&bpdu->bridge_id, &p->br->bridge_id, 8)) return 1; if (bpdu->port_id <= p->designated_port) return 1; return 0; } /* called under bridge lock */ static void br_topology_change_acknowledged(struct net_bridge *br) { br->topology_change_detected = 0; del_timer(&br->tcn_timer); } /* called under bridge lock */ void br_topology_change_detection(struct net_bridge *br) { int isroot = br_is_root_bridge(br); if (br->stp_enabled != BR_KERNEL_STP) return; br_info(br, "topology change detected, %s\n", isroot ? "propagating" : "sending tcn bpdu"); if (isroot) { __br_set_topology_change(br, 1); mod_timer(&br->topology_change_timer, jiffies + br->bridge_forward_delay + br->bridge_max_age); } else if (!br->topology_change_detected) { br_transmit_tcn(br); mod_timer(&br->tcn_timer, jiffies + br->bridge_hello_time); } br->topology_change_detected = 1; } /* called under bridge lock */ void br_config_bpdu_generation(struct net_bridge *br) { struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) { if (p->state != BR_STATE_DISABLED && br_is_designated_port(p)) br_transmit_config(p); } } /* called under bridge lock */ static void br_reply(struct net_bridge_port *p) { br_transmit_config(p); } /* called under bridge lock */ void br_configuration_update(struct net_bridge *br) { br_root_selection(br); br_designated_port_selection(br); } /* called under bridge lock */ void br_become_designated_port(struct net_bridge_port *p) { struct net_bridge *br; br = p->br; p->designated_root = br->designated_root; p->designated_cost = br->root_path_cost; p->designated_bridge = br->bridge_id; p->designated_port = p->port_id; } /* called under bridge lock */ static void br_make_blocking(struct net_bridge_port *p) { if (p->state != BR_STATE_DISABLED && p->state != BR_STATE_BLOCKING) { if (p->state == BR_STATE_FORWARDING || p->state == BR_STATE_LEARNING) br_topology_change_detection(p->br); br_set_state(p, BR_STATE_BLOCKING); br_ifinfo_notify(RTM_NEWLINK, NULL, p); del_timer(&p->forward_delay_timer); } } /* called under bridge lock */ static void br_make_forwarding(struct net_bridge_port *p) { struct net_bridge *br = p->br; if (p->state != BR_STATE_BLOCKING) return; if (br->stp_enabled == BR_NO_STP || br->forward_delay == 0) { br_set_state(p, BR_STATE_FORWARDING); br_topology_change_detection(br); del_timer(&p->forward_delay_timer); } else if (br->stp_enabled == BR_KERNEL_STP) br_set_state(p, BR_STATE_LISTENING); else br_set_state(p, BR_STATE_LEARNING); br_ifinfo_notify(RTM_NEWLINK, NULL, p); if (br->forward_delay != 0) mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay); } /* called under bridge lock */ void br_port_state_selection(struct net_bridge *br) { struct net_bridge_port *p; unsigned int liveports = 0; list_for_each_entry(p, &br->port_list, list) { if (p->state == BR_STATE_DISABLED) continue; /* Don't change port states if userspace is handling STP */ if (br->stp_enabled != BR_USER_STP) { if (p->port_no == br->root_port) { p->config_pending = 0; p->topology_change_ack = 0; br_make_forwarding(p); } else if (br_is_designated_port(p)) { del_timer(&p->message_age_timer); br_make_forwarding(p); } else { p->config_pending = 0; p->topology_change_ack = 0; br_make_blocking(p); } } if (p->state != BR_STATE_BLOCKING) br_multicast_enable_port(p); /* Multicast is not disabled for the port when it goes in * blocking state because the timers will expire and stop by * themselves without sending more queries. */ if (p->state == BR_STATE_FORWARDING) ++liveports; } if (liveports == 0) netif_carrier_off(br->dev); else netif_carrier_on(br->dev); } /* called under bridge lock */ static void br_topology_change_acknowledge(struct net_bridge_port *p) { p->topology_change_ack = 1; br_transmit_config(p); } /* called under bridge lock */ void br_received_config_bpdu(struct net_bridge_port *p, const struct br_config_bpdu *bpdu) { struct net_bridge *br; int was_root; p->stp_xstats.rx_bpdu++; br = p->br; was_root = br_is_root_bridge(br); if (br_supersedes_port_info(p, bpdu)) { br_record_config_information(p, bpdu); br_configuration_update(br); br_port_state_selection(br); if (!br_is_root_bridge(br) && was_root) { del_timer(&br->hello_timer); if (br->topology_change_detected) { del_timer(&br->topology_change_timer); br_transmit_tcn(br); mod_timer(&br->tcn_timer, jiffies + br->bridge_hello_time); } } if (p->port_no == br->root_port) { br_record_config_timeout_values(br, bpdu); br_config_bpdu_generation(br); if (bpdu->topology_change_ack) br_topology_change_acknowledged(br); } } else if (br_is_designated_port(p)) { br_reply(p); } } /* called under bridge lock */ void br_received_tcn_bpdu(struct net_bridge_port *p) { p->stp_xstats.rx_tcn++; if (br_is_designated_port(p)) { br_info(p->br, "port %u(%s) received tcn bpdu\n", (unsigned int) p->port_no, p->dev->name); br_topology_change_detection(p->br); br_topology_change_acknowledge(p); } } /* Change bridge STP parameter */ int br_set_hello_time(struct net_bridge *br, unsigned long val) { unsigned long t = clock_t_to_jiffies(val); if (t < BR_MIN_HELLO_TIME || t > BR_MAX_HELLO_TIME) return -ERANGE; spin_lock_bh(&br->lock); br->bridge_hello_time = t; if (br_is_root_bridge(br)) br->hello_time = br->bridge_hello_time; spin_unlock_bh(&br->lock); return 0; } int br_set_max_age(struct net_bridge *br, unsigned long val) { unsigned long t = clock_t_to_jiffies(val); if (t < BR_MIN_MAX_AGE || t > BR_MAX_MAX_AGE) return -ERANGE; spin_lock_bh(&br->lock); br->bridge_max_age = t; if (br_is_root_bridge(br)) br->max_age = br->bridge_max_age; spin_unlock_bh(&br->lock); return 0; } /* called under bridge lock */ int __set_ageing_time(struct net_device *dev, unsigned long t) { struct switchdev_attr attr = { .orig_dev = dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP | SWITCHDEV_F_DEFER, .u.ageing_time = jiffies_to_clock_t(t), }; int err; err = switchdev_port_attr_set(dev, &attr, NULL); if (err && err != -EOPNOTSUPP) return err; return 0; } /* Set time interval that dynamic forwarding entries live * For pure software bridge, allow values outside the 802.1 * standard specification for special cases: * 0 - entry never ages (all permanent) * 1 - entry disappears (no persistence) * * Offloaded switch entries maybe more restrictive */ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time) { unsigned long t = clock_t_to_jiffies(ageing_time); int err; err = __set_ageing_time(br->dev, t); if (err) return err; spin_lock_bh(&br->lock); br->bridge_ageing_time = t; br->ageing_time = t; spin_unlock_bh(&br->lock); mod_delayed_work(system_long_wq, &br->gc_work, 0); return 0; } clock_t br_get_ageing_time(const struct net_device *br_dev) { const struct net_bridge *br; if (!netif_is_bridge_master(br_dev)) return 0; br = netdev_priv(br_dev); return jiffies_to_clock_t(br->ageing_time); } EXPORT_SYMBOL_GPL(br_get_ageing_time); /* called under bridge lock */ void __br_set_topology_change(struct net_bridge *br, unsigned char val) { unsigned long t; int err; if (br->stp_enabled == BR_KERNEL_STP && br->topology_change != val) { /* On topology change, set the bridge ageing time to twice the * forward delay. Otherwise, restore its default ageing time. */ if (val) { t = 2 * br->forward_delay; br_debug(br, "decreasing ageing time to %lu\n", t); } else { t = br->bridge_ageing_time; br_debug(br, "restoring ageing time to %lu\n", t); } err = __set_ageing_time(br->dev, t); if (err) br_warn(br, "error offloading ageing time\n"); else br->ageing_time = t; } br->topology_change = val; } void __br_set_forward_delay(struct net_bridge *br, unsigned long t) { br->bridge_forward_delay = t; if (br_is_root_bridge(br)) br->forward_delay = br->bridge_forward_delay; } int br_set_forward_delay(struct net_bridge *br, unsigned long val) { unsigned long t = clock_t_to_jiffies(val); int err = -ERANGE; spin_lock_bh(&br->lock); if (br->stp_enabled != BR_NO_STP && (t < BR_MIN_FORWARD_DELAY || t > BR_MAX_FORWARD_DELAY)) goto unlock; __br_set_forward_delay(br, t); err = 0; unlock: spin_unlock_bh(&br->lock); return err; } |
1 1 1 1 1 1 10 10 1 3 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu> * Patrick Schaaf <bof@bof.de> * Martin Josefsson <gandalf@wlug.westbo.se> */ /* Kernel module implementing an IP set type: the bitmap:ip,mac type */ #include <linux/module.h> #include <linux/ip.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/if_ether.h> #include <linux/netlink.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <net/netlink.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_bitmap.h> #define IPSET_TYPE_REV_MIN 0 /* 1 Counter support added */ /* 2 Comment support added */ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("bitmap:ip,mac", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_bitmap:ip,mac"); #define MTYPE bitmap_ipmac #define HOST_MASK 32 #define IP_SET_BITMAP_STORED_TIMEOUT enum { MAC_UNSET, /* element is set, without MAC */ MAC_FILLED, /* element is set with MAC */ }; /* Type structure */ struct bitmap_ipmac { unsigned long *members; /* the set members */ u32 first_ip; /* host byte order, included in range */ u32 last_ip; /* host byte order, included in range */ u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ struct timer_list gc; /* garbage collector */ struct ip_set *set; /* attached to this ip_set */ unsigned char extensions[] /* MAC + data extensions */ __aligned(__alignof__(u64)); }; /* ADT structure for generic function args */ struct bitmap_ipmac_adt_elem { unsigned char ether[ETH_ALEN] __aligned(2); u16 id; u16 add_mac; }; struct bitmap_ipmac_elem { unsigned char ether[ETH_ALEN]; unsigned char filled; } __aligned(__alignof__(u64)); static u32 ip_to_id(const struct bitmap_ipmac *m, u32 ip) { return ip - m->first_ip; } #define get_elem(extensions, id, dsize) \ (struct bitmap_ipmac_elem *)(extensions + (id) * (dsize)) #define get_const_elem(extensions, id, dsize) \ (const struct bitmap_ipmac_elem *)(extensions + (id) * (dsize)) /* Common functions */ static int bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e, const struct bitmap_ipmac *map, size_t dsize) { const struct bitmap_ipmac_elem *elem; if (!test_bit(e->id, map->members)) return 0; elem = get_const_elem(map->extensions, e->id, dsize); if (e->add_mac && elem->filled == MAC_FILLED) return ether_addr_equal(e->ether, elem->ether); /* Trigger kernel to fill out the ethernet address */ return -EAGAIN; } static int bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize) { const struct bitmap_ipmac_elem *elem; if (!test_bit(id, map->members)) return 0; elem = get_const_elem(map->extensions, id, dsize); /* Timer not started for the incomplete elements */ return elem->filled == MAC_FILLED; } static int bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem) { return elem->filled == MAC_FILLED; } static int bitmap_ipmac_add_timeout(unsigned long *timeout, const struct bitmap_ipmac_adt_elem *e, const struct ip_set_ext *ext, struct ip_set *set, struct bitmap_ipmac *map, int mode) { u32 t = ext->timeout; if (mode == IPSET_ADD_START_STORED_TIMEOUT) { if (t == set->timeout) /* Timeout was not specified, get stored one */ t = *timeout; ip_set_timeout_set(timeout, t); } else { /* If MAC is unset yet, we store plain timeout value * because the timer is not activated yet * and we can reuse it later when MAC is filled out, * possibly by the kernel */ if (e->add_mac) ip_set_timeout_set(timeout, t); else *timeout = t; } return 0; } static int bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac *map, u32 flags, size_t dsize) { struct bitmap_ipmac_elem *elem; elem = get_elem(map->extensions, e->id, dsize); if (test_bit(e->id, map->members)) { if (elem->filled == MAC_FILLED) { if (e->add_mac && (flags & IPSET_FLAG_EXIST) && !ether_addr_equal(e->ether, elem->ether)) { /* memcpy isn't atomic */ clear_bit(e->id, map->members); smp_mb__after_atomic(); ether_addr_copy(elem->ether, e->ether); } return IPSET_ADD_FAILED; } else if (!e->add_mac) /* Already added without ethernet address */ return IPSET_ADD_FAILED; /* Fill the MAC address and trigger the timer activation */ clear_bit(e->id, map->members); smp_mb__after_atomic(); ether_addr_copy(elem->ether, e->ether); elem->filled = MAC_FILLED; return IPSET_ADD_START_STORED_TIMEOUT; } else if (e->add_mac) { /* We can store MAC too */ ether_addr_copy(elem->ether, e->ether); elem->filled = MAC_FILLED; return 0; } elem->filled = MAC_UNSET; /* MAC is not stored yet, don't start timer */ return IPSET_ADD_STORE_PLAIN_TIMEOUT; } static int bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e, struct bitmap_ipmac *map) { return !test_and_clear_bit(e->id, map->members); } static int bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map, u32 id, size_t dsize) { const struct bitmap_ipmac_elem *elem = get_const_elem(map->extensions, id, dsize); return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip + id)) || (elem->filled == MAC_FILLED && nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether)); } static int bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map) { return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) || nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)); } static int bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct bitmap_ipmac_adt_elem e = { .id = 0, .add_mac = 1 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); u32 ip; ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC)); if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; /* Backward compatibility: we don't check the second flag */ if (skb_mac_header(skb) < skb->head || (skb_mac_header(skb) + ETH_HLEN) > skb->data) return -EINVAL; e.id = ip_to_id(map, ip); if (opt->flags & IPSET_DIM_TWO_SRC) ether_addr_copy(e.ether, eth_hdr(skb)->h_source); else ether_addr_copy(e.ether, eth_hdr(skb)->h_dest); if (is_zero_ether_addr(e.ether)) return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct bitmap_ipmac *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct bitmap_ipmac_adt_elem e = { .id = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0; int ret = 0; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP])) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (ip < map->first_ip || ip > map->last_ip) return -IPSET_ERR_BITMAP_RANGE; e.id = ip_to_id(map, ip); if (tb[IPSET_ATTR_ETHER]) { if (nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN) return -IPSET_ERR_PROTOCOL; memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN); e.add_mac = 1; } ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } static bool bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b) { const struct bitmap_ipmac *x = a->data; const struct bitmap_ipmac *y = b->data; return x->first_ip == y->first_ip && x->last_ip == y->last_ip && a->timeout == b->timeout && a->extensions == b->extensions; } /* Plain variant */ #include "ip_set_bitmap_gen.h" /* Create bitmap:ip,mac type of sets */ static bool init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, u32 first_ip, u32 last_ip, u32 elements) { map->members = bitmap_zalloc(elements, GFP_KERNEL | __GFP_NOWARN); if (!map->members) return false; map->first_ip = first_ip; map->last_ip = last_ip; map->elements = elements; set->timeout = IPSET_NO_TIMEOUT; map->set = set; set->data = map; set->family = NFPROTO_IPV4; return true; } static int bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) { u32 first_ip = 0, last_ip = 0; u64 elements; struct bitmap_ipmac *map; int ret; if (unlikely(!tb[IPSET_ATTR_IP] || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip); if (ret) return ret; if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &last_ip); if (ret) return ret; if (first_ip > last_ip) swap(first_ip, last_ip); } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr >= HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(first_ip, last_ip, cidr); } else { return -IPSET_ERR_PROTOCOL; } elements = (u64)last_ip - first_ip + 1; if (elements > IPSET_BITMAP_MAX_RANGE + 1) return -IPSET_ERR_BITMAP_RANGE_SIZE; set->dsize = ip_set_elem_len(set, tb, sizeof(struct bitmap_ipmac_elem), __alignof__(struct bitmap_ipmac_elem)); map = ip_set_alloc(sizeof(*map) + elements * set->dsize); if (!map) return -ENOMEM; map->memsize = BITS_TO_LONGS(elements) * sizeof(unsigned long); set->variant = &bitmap_ipmac; if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) { ip_set_free(map); return -ENOMEM; } if (tb[IPSET_ATTR_TIMEOUT]) { set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); bitmap_ipmac_gc_init(set, bitmap_ipmac_gc); } return 0; } static struct ip_set_type bitmap_ipmac_type = { .name = "bitmap:ip,mac", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_MAC, .dimension = IPSET_DIM_TWO, .family = NFPROTO_IPV4, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create = bitmap_ipmac_create, .create_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_ETHER] = { .type = NLA_BINARY, .len = ETH_ALEN }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init bitmap_ipmac_init(void) { return ip_set_type_register(&bitmap_ipmac_type); } static void __exit bitmap_ipmac_fini(void) { rcu_barrier(); ip_set_type_unregister(&bitmap_ipmac_type); } module_init(bitmap_ipmac_init); module_exit(bitmap_ipmac_fini); |
5 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 1 5 2 2 1 1 3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 5 23 23 23 1 1 1 1 1 5 5 5 1 1 1 1 1 1 1 5 1 1 1 1 35 35 35 41 40 1 38 1 1 1 35 2 1 35 1 1 34 1 1 1 1 2 3 5 5 5 5 5 5 5 5 5 3 2 28 26 22 5 12 17 35 25 4 1 5 1 1 1 4 1 26 1 1 1 1 27 1 1 27 1 27 23 17 23 23 23 23 23 24 35 24 22 3 23 20 3 3 3 23 23 4 2 4 3 1 35 2 4 1 1 33 33 2 33 1 34 1 35 34 1 34 1 1 34 34 1 34 1 34 1 34 1 34 1 34 1 33 1 1 35 34 1 34 1 33 1 35 35 35 34 34 1 34 1 35 35 35 1 1 1 26 58 55 7 11 58 53 12 1 1 2 1 1 1 1 1 1 1 2 1 2 1 1 58 58 57 58 58 177 89 847 843 177 1 129 128 4 4 4 13 10 4 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 | // SPDX-License-Identifier: GPL-2.0-only /* * VXLAN: Virtual eXtensible Local Area Network * * Copyright (c) 2012-2013 Vyatta Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/udp.h> #include <linux/igmp.h> #include <linux/if_ether.h> #include <linux/ethtool.h> #include <net/arp.h> #include <net/ndisc.h> #include <net/gro.h> #include <net/ipv6_stubs.h> #include <net/ip.h> #include <net/icmp.h> #include <net/rtnetlink.h> #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/tun_proto.h> #include <net/vxlan.h> #include <net/nexthop.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ip6_tunnel.h> #include <net/ip6_checksum.h> #endif #include "vxlan_private.h" #define VXLAN_VERSION "0.1" #define FDB_AGE_DEFAULT 300 /* 5 min */ #define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */ /* UDP port for VXLAN traffic. * The IANA assigned port is 4789, but the Linux default is 8472 * for compatibility with early adopters. */ static unsigned short vxlan_port __read_mostly = 8472; module_param_named(udp_port, vxlan_port, ushort, 0444); MODULE_PARM_DESC(udp_port, "Destination UDP port"); static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); unsigned int vxlan_net_id; const u8 all_zeros_mac[ETH_ALEN + 2]; static struct rtnl_link_ops vxlan_link_ops; static int vxlan_sock_add(struct vxlan_dev *vxlan); static void vxlan_vs_del_dev(struct vxlan_dev *vxlan); /* salt for hash table */ static u32 vxlan_salt __read_mostly; static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) { return vs->flags & VXLAN_F_COLLECT_METADATA || ip_tunnel_collect_metadata(); } /* Find VXLAN socket based on network namespace, address family, UDP port, * enabled unshareable flags and socket device binding (see l3mdev with * non-default VRF). */ static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family, __be16 port, u32 flags, int ifindex) { struct vxlan_sock *vs; flags &= VXLAN_F_RCV_FLAGS; hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { if (inet_sk(vs->sock->sk)->inet_sport == port && vxlan_get_sk_family(vs) == family && vs->flags == flags && vs->sock->sk->sk_bound_dev_if == ifindex) return vs; } return NULL; } static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex, __be32 vni, struct vxlan_vni_node **vninode) { struct vxlan_vni_node *vnode; struct vxlan_dev_node *node; /* For flow based devices, map all packets to VNI 0 */ if (vs->flags & VXLAN_F_COLLECT_METADATA && !(vs->flags & VXLAN_F_VNIFILTER)) vni = 0; hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) { if (!node->vxlan) continue; vnode = NULL; if (node->vxlan->cfg.flags & VXLAN_F_VNIFILTER) { vnode = vxlan_vnifilter_lookup(node->vxlan, vni); if (!vnode) continue; } else if (node->vxlan->default_dst.remote_vni != vni) { continue; } if (IS_ENABLED(CONFIG_IPV6)) { const struct vxlan_config *cfg = &node->vxlan->cfg; if ((cfg->flags & VXLAN_F_IPV6_LINKLOCAL) && cfg->remote_ifindex != ifindex) continue; } if (vninode) *vninode = vnode; return node->vxlan; } return NULL; } /* Look up VNI in a per net namespace table */ static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex, __be32 vni, sa_family_t family, __be16 port, u32 flags) { struct vxlan_sock *vs; vs = vxlan_find_sock(net, family, port, flags, ifindex); if (!vs) return NULL; return vxlan_vs_find_vni(vs, ifindex, vni, NULL); } /* Fill in neighbour message in skbuff. */ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, const struct vxlan_fdb *fdb, u32 portid, u32 seq, int type, unsigned int flags, const struct vxlan_rdst *rdst) { unsigned long now = jiffies; struct nda_cacheinfo ci; bool send_ip, send_eth; struct nlmsghdr *nlh; struct nexthop *nh; struct ndmsg *ndm; int nh_family; u32 nh_id; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; ndm = nlmsg_data(nlh); memset(ndm, 0, sizeof(*ndm)); send_eth = send_ip = true; rcu_read_lock(); nh = rcu_dereference(fdb->nh); if (nh) { nh_family = nexthop_get_family(nh); nh_id = nh->id; } rcu_read_unlock(); if (type == RTM_GETNEIGH) { if (rdst) { send_ip = !vxlan_addr_any(&rdst->remote_ip); ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET; } else if (nh) { ndm->ndm_family = nh_family; } send_eth = !is_zero_ether_addr(fdb->eth_addr); } else ndm->ndm_family = AF_BRIDGE; ndm->ndm_state = fdb->state; ndm->ndm_ifindex = vxlan->dev->ifindex; ndm->ndm_flags = fdb->flags; if (rdst && rdst->offloaded) ndm->ndm_flags |= NTF_OFFLOADED; ndm->ndm_type = RTN_UNICAST; if (!net_eq(dev_net(vxlan->dev), vxlan->net) && nla_put_s32(skb, NDA_LINK_NETNSID, peernet2id(dev_net(vxlan->dev), vxlan->net))) goto nla_put_failure; if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) goto nla_put_failure; if (nh) { if (nla_put_u32(skb, NDA_NH_ID, nh_id)) goto nla_put_failure; } else if (rdst) { if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip)) goto nla_put_failure; if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port && nla_put_be16(skb, NDA_PORT, rdst->remote_port)) goto nla_put_failure; if (rdst->remote_vni != vxlan->default_dst.remote_vni && nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni))) goto nla_put_failure; if (rdst->remote_ifindex && nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) goto nla_put_failure; } if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni && nla_put_u32(skb, NDA_SRC_VNI, be32_to_cpu(fdb->vni))) goto nla_put_failure; ci.ndm_used = jiffies_to_clock_t(now - fdb->used); ci.ndm_confirmed = 0; ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); ci.ndm_refcnt = 0; if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static inline size_t vxlan_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */ + nla_total_size(sizeof(__be16)) /* NDA_PORT */ + nla_total_size(sizeof(__be32)) /* NDA_VNI */ + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */ + nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */ + nla_total_size(sizeof(struct nda_cacheinfo)); } static void __vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, struct vxlan_rdst *rd, int type) { struct net *net = dev_net(vxlan->dev); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd); if (err < 0) { /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan, const struct vxlan_fdb *fdb, const struct vxlan_rdst *rd, struct netlink_ext_ack *extack, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { fdb_info->info.dev = vxlan->dev; fdb_info->info.extack = extack; fdb_info->remote_ip = rd->remote_ip; fdb_info->remote_port = rd->remote_port; fdb_info->remote_vni = rd->remote_vni; fdb_info->remote_ifindex = rd->remote_ifindex; memcpy(fdb_info->eth_addr, fdb->eth_addr, ETH_ALEN); fdb_info->vni = fdb->vni; fdb_info->offloaded = rd->offloaded; fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER; } static int vxlan_fdb_switchdev_call_notifiers(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, struct vxlan_rdst *rd, bool adding, struct netlink_ext_ack *extack) { struct switchdev_notifier_vxlan_fdb_info info; enum switchdev_notifier_type notifier_type; int ret; if (WARN_ON(!rd)) return 0; notifier_type = adding ? SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE : SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE; vxlan_fdb_switchdev_notifier_info(vxlan, fdb, rd, NULL, &info); ret = call_switchdev_notifiers(notifier_type, vxlan->dev, &info.info, extack); return notifier_to_errno(ret); } static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, struct vxlan_rdst *rd, int type, bool swdev_notify, struct netlink_ext_ack *extack) { int err; if (swdev_notify && rd) { switch (type) { case RTM_NEWNEIGH: err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd, true, extack); if (err) return err; break; case RTM_DELNEIGH: vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd, false, extack); break; } } __vxlan_fdb_notify(vxlan, fdb, rd, type); return 0; } static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb f = { .state = NUD_STALE, }; struct vxlan_rdst remote = { .remote_ip = *ipa, /* goes to NDA_DST */ .remote_vni = cpu_to_be32(VXLAN_N_VID), }; vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL); } static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) { struct vxlan_fdb f = { .state = NUD_STALE, }; struct vxlan_rdst remote = { }; memcpy(f.eth_addr, eth_addr, ETH_ALEN); vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL); } /* Hash Ethernet address */ static u32 eth_hash(const unsigned char *addr) { u64 value = get_unaligned((u64 *)addr); /* only want 6 bytes */ #ifdef __BIG_ENDIAN value >>= 16; #else value <<= 16; #endif return hash_64(value, FDB_HASH_BITS); } u32 eth_vni_hash(const unsigned char *addr, __be32 vni) { /* use 1 byte of OUI and 3 bytes of NIC */ u32 key = get_unaligned((u32 *)(addr + 2)); return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1); } u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) return eth_vni_hash(mac, vni); else return eth_hash(mac); } /* Hash chain to use given mac address */ static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { return &vxlan->fdb_head[fdb_head_index(vxlan, mac, vni)]; } /* Look up Ethernet address in forwarding table */ static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni); struct vxlan_fdb *f; hlist_for_each_entry_rcu(f, head, hlist) { if (ether_addr_equal(mac, f->eth_addr)) { if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) { if (vni == f->vni) return f; } else { return f; } } } return NULL; } static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { struct vxlan_fdb *f; f = __vxlan_find_mac(vxlan, mac, vni); if (f && f->used != jiffies) f->used = jiffies; return f; } /* caller should hold vxlan->hash_lock */ static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f, union vxlan_addr *ip, __be16 port, __be32 vni, __u32 ifindex) { struct vxlan_rdst *rd; list_for_each_entry(rd, &f->remotes, list) { if (vxlan_addr_equal(&rd->remote_ip, ip) && rd->remote_port == port && rd->remote_vni == vni && rd->remote_ifindex == ifindex) return rd; } return NULL; } int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { struct vxlan_dev *vxlan = netdev_priv(dev); u8 eth_addr[ETH_ALEN + 2] = { 0 }; struct vxlan_rdst *rdst; struct vxlan_fdb *f; int rc = 0; if (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)) return -EINVAL; ether_addr_copy(eth_addr, mac); rcu_read_lock(); f = __vxlan_find_mac(vxlan, eth_addr, vni); if (!f) { rc = -ENOENT; goto out; } rdst = first_remote_rcu(f); vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, NULL, fdb_info); out: rcu_read_unlock(); return rc; } EXPORT_SYMBOL_GPL(vxlan_fdb_find_uc); static int vxlan_fdb_notify_one(struct notifier_block *nb, const struct vxlan_dev *vxlan, const struct vxlan_fdb *f, const struct vxlan_rdst *rdst, struct netlink_ext_ack *extack) { struct switchdev_notifier_vxlan_fdb_info fdb_info; int rc; vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, extack, &fdb_info); rc = nb->notifier_call(nb, SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE, &fdb_info); return notifier_to_errno(rc); } int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan; struct vxlan_rdst *rdst; struct vxlan_fdb *f; unsigned int h; int rc = 0; if (!netif_is_vxlan(dev)) return -EINVAL; vxlan = netdev_priv(dev); for (h = 0; h < FDB_HASH_SIZE; ++h) { spin_lock_bh(&vxlan->hash_lock[h]); hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) { if (f->vni == vni) { list_for_each_entry(rdst, &f->remotes, list) { rc = vxlan_fdb_notify_one(nb, vxlan, f, rdst, extack); if (rc) goto unlock; } } } spin_unlock_bh(&vxlan->hash_lock[h]); } return 0; unlock: spin_unlock_bh(&vxlan->hash_lock[h]); return rc; } EXPORT_SYMBOL_GPL(vxlan_fdb_replay); void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni) { struct vxlan_dev *vxlan; struct vxlan_rdst *rdst; struct vxlan_fdb *f; unsigned int h; if (!netif_is_vxlan(dev)) return; vxlan = netdev_priv(dev); for (h = 0; h < FDB_HASH_SIZE; ++h) { spin_lock_bh(&vxlan->hash_lock[h]); hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) if (f->vni == vni) list_for_each_entry(rdst, &f->remotes, list) rdst->offloaded = false; spin_unlock_bh(&vxlan->hash_lock[h]); } } EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload); /* Replace destination of unicast mac */ static int vxlan_fdb_replace(struct vxlan_fdb *f, union vxlan_addr *ip, __be16 port, __be32 vni, __u32 ifindex, struct vxlan_rdst *oldrd) { struct vxlan_rdst *rd; rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); if (rd) return 0; rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list); if (!rd) return 0; *oldrd = *rd; dst_cache_reset(&rd->dst_cache); rd->remote_ip = *ip; rd->remote_port = port; rd->remote_vni = vni; rd->remote_ifindex = ifindex; rd->offloaded = false; return 1; } /* Add/update destinations for multicast */ static int vxlan_fdb_append(struct vxlan_fdb *f, union vxlan_addr *ip, __be16 port, __be32 vni, __u32 ifindex, struct vxlan_rdst **rdp) { struct vxlan_rdst *rd; rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); if (rd) return 0; rd = kmalloc(sizeof(*rd), GFP_ATOMIC); if (rd == NULL) return -ENOMEM; if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) { kfree(rd); return -ENOMEM; } rd->remote_ip = *ip; rd->remote_port = port; rd->offloaded = false; rd->remote_vni = vni; rd->remote_ifindex = ifindex; list_add_tail_rcu(&rd->list, &f->remotes); *rdp = rd; return 1; } static bool vxlan_parse_gpe_proto(struct vxlanhdr *hdr, __be16 *protocol) { struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)hdr; /* Need to have Next Protocol set for interfaces in GPE mode. */ if (!gpe->np_applied) return false; /* "The initial version is 0. If a receiver does not support the * version indicated it MUST drop the packet. */ if (gpe->version != 0) return false; /* "When the O bit is set to 1, the packet is an OAM packet and OAM * processing MUST occur." However, we don't implement OAM * processing, thus drop the packet. */ if (gpe->oam_flag) return false; *protocol = tun_p_to_eth_p(gpe->next_protocol); if (!*protocol) return false; return true; } static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, unsigned int off, struct vxlanhdr *vh, size_t hdrlen, __be32 vni_field, struct gro_remcsum *grc, bool nopartial) { size_t start, offset; if (skb->remcsum_offload) return vh; if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; start = vxlan_rco_start(vni_field); offset = start + vxlan_rco_offset(vni_field); vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen, start, offset, grc, nopartial); skb->remcsum_offload = 1; return vh; } static struct vxlanhdr *vxlan_gro_prepare_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb, struct gro_remcsum *grc) { struct sk_buff *p; struct vxlanhdr *vh, *vh2; unsigned int hlen, off_vx; struct vxlan_sock *vs = rcu_dereference_sk_user_data(sk); __be32 flags; skb_gro_remcsum_init(grc); off_vx = skb_gro_offset(skb); hlen = off_vx + sizeof(*vh); vh = skb_gro_header(skb, hlen, off_vx); if (unlikely(!vh)) return NULL; skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); flags = vh->vx_flags; if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr), vh->vx_vni, grc, !!(vs->flags & VXLAN_F_REMCSUM_NOPARTIAL)); if (!vh) return NULL; } skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; vh2 = (struct vxlanhdr *)(p->data + off_vx); if (vh->vx_flags != vh2->vx_flags || vh->vx_vni != vh2->vx_vni) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } return vh; } static struct sk_buff *vxlan_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { struct sk_buff *pp = NULL; struct gro_remcsum grc; int flush = 1; if (vxlan_gro_prepare_receive(sk, head, skb, &grc)) { pp = call_gro_receive(eth_gro_receive, head, skb); flush = 0; } skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } static struct sk_buff *vxlan_gpe_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { const struct packet_offload *ptype; struct sk_buff *pp = NULL; struct gro_remcsum grc; struct vxlanhdr *vh; __be16 protocol; int flush = 1; vh = vxlan_gro_prepare_receive(sk, head, skb, &grc); if (vh) { if (!vxlan_parse_gpe_proto(vh, &protocol)) goto out; ptype = gro_find_receive_by_type(protocol); if (!ptype) goto out; pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); flush = 0; } out: skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { /* Sets 'skb->inner_mac_header' since we are always called with * 'skb->encapsulation' set. */ return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); } static int vxlan_gpe_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { struct vxlanhdr *vh = (struct vxlanhdr *)(skb->data + nhoff); const struct packet_offload *ptype; int err = -ENOSYS; __be16 protocol; if (!vxlan_parse_gpe_proto(vh, &protocol)) return err; ptype = gro_find_complete_by_type(protocol); if (ptype) err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); return err; } static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac, __u16 state, __be32 src_vni, __u16 ndm_flags) { struct vxlan_fdb *f; f = kmalloc(sizeof(*f), GFP_ATOMIC); if (!f) return NULL; f->state = state; f->flags = ndm_flags; f->updated = f->used = jiffies; f->vni = src_vni; f->nh = NULL; RCU_INIT_POINTER(f->vdev, vxlan); INIT_LIST_HEAD(&f->nh_list); INIT_LIST_HEAD(&f->remotes); memcpy(f->eth_addr, mac, ETH_ALEN); return f; } static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac, __be32 src_vni, struct vxlan_fdb *f) { ++vxlan->addrcnt; hlist_add_head_rcu(&f->hlist, vxlan_fdb_head(vxlan, mac, src_vni)); } static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, u32 nhid, struct netlink_ext_ack *extack) { struct nexthop *old_nh = rtnl_dereference(fdb->nh); struct nexthop *nh; int err = -EINVAL; if (old_nh && old_nh->id == nhid) return 0; nh = nexthop_find_by_id(vxlan->net, nhid); if (!nh) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); goto err_inval; } if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); nh = NULL; goto err_inval; } if (!nexthop_is_fdb(nh)) { NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop"); goto err_inval; } if (!nexthop_is_multipath(nh)) { NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group"); goto err_inval; } /* check nexthop group family */ switch (vxlan->default_dst.remote_ip.sa.sa_family) { case AF_INET: if (!nexthop_has_v4(nh)) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); goto err_inval; } break; case AF_INET6: if (nexthop_has_v4(nh)) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); goto err_inval; } } if (old_nh) { list_del_rcu(&fdb->nh_list); nexthop_put(old_nh); } rcu_assign_pointer(fdb->nh, nh); list_add_tail_rcu(&fdb->nh_list, &nh->fdb_list); return 1; err_inval: if (nh) nexthop_put(nh); return err; } int vxlan_fdb_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, u32 nhid, struct vxlan_fdb **fdb, struct netlink_ext_ack *extack) { struct vxlan_rdst *rd = NULL; struct vxlan_fdb *f; int rc; if (vxlan->cfg.addrmax && vxlan->addrcnt >= vxlan->cfg.addrmax) return -ENOSPC; netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags); if (!f) return -ENOMEM; if (nhid) rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); else rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); if (rc < 0) goto errout; *fdb = f; return 0; errout: kfree(f); return rc; } static void __vxlan_fdb_free(struct vxlan_fdb *f) { struct vxlan_rdst *rd, *nd; struct nexthop *nh; nh = rcu_dereference_raw(f->nh); if (nh) { rcu_assign_pointer(f->nh, NULL); rcu_assign_pointer(f->vdev, NULL); nexthop_put(nh); } list_for_each_entry_safe(rd, nd, &f->remotes, list) { dst_cache_destroy(&rd->dst_cache); kfree(rd); } kfree(f); } static void vxlan_fdb_free(struct rcu_head *head) { struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu); __vxlan_fdb_free(f); } static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, bool do_notify, bool swdev_notify) { struct vxlan_rdst *rd; netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr); --vxlan->addrcnt; if (do_notify) { if (rcu_access_pointer(f->nh)) vxlan_fdb_notify(vxlan, f, NULL, RTM_DELNEIGH, swdev_notify, NULL); else list_for_each_entry(rd, &f->remotes, list) vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, swdev_notify, NULL); } hlist_del_rcu(&f->hlist); list_del_rcu(&f->nh_list); call_rcu(&f->rcu, vxlan_fdb_free); } static void vxlan_dst_free(struct rcu_head *head) { struct vxlan_rdst *rd = container_of(head, struct vxlan_rdst, rcu); dst_cache_destroy(&rd->dst_cache); kfree(rd); } static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 vni, __u32 ifindex, __u16 ndm_flags, struct vxlan_fdb *f, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { __u16 fdb_flags = (ndm_flags & ~NTF_USE); struct vxlan_rdst *rd = NULL; struct vxlan_rdst oldrd; int notify = 0; int rc = 0; int err; if (nhid && !rcu_access_pointer(f->nh)) { NL_SET_ERR_MSG(extack, "Cannot replace an existing non nexthop fdb with a nexthop"); return -EOPNOTSUPP; } if (nhid && (flags & NLM_F_APPEND)) { NL_SET_ERR_MSG(extack, "Cannot append to a nexthop fdb"); return -EOPNOTSUPP; } /* Do not allow an externally learned entry to take over an entry added * by the user. */ if (!(fdb_flags & NTF_EXT_LEARNED) || !(f->flags & NTF_VXLAN_ADDED_BY_USER)) { if (f->state != state) { f->state = state; f->updated = jiffies; notify = 1; } if (f->flags != fdb_flags) { f->flags = fdb_flags; f->updated = jiffies; notify = 1; } } if ((flags & NLM_F_REPLACE)) { /* Only change unicasts */ if (!(is_multicast_ether_addr(f->eth_addr) || is_zero_ether_addr(f->eth_addr))) { if (nhid) { rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); if (rc < 0) return rc; } else { rc = vxlan_fdb_replace(f, ip, port, vni, ifindex, &oldrd); } notify |= rc; } else { NL_SET_ERR_MSG(extack, "Cannot replace non-unicast fdb entries"); return -EOPNOTSUPP; } } if ((flags & NLM_F_APPEND) && (is_multicast_ether_addr(f->eth_addr) || is_zero_ether_addr(f->eth_addr))) { rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); if (rc < 0) return rc; notify |= rc; } if (ndm_flags & NTF_USE) f->used = jiffies; if (notify) { if (rd == NULL) rd = first_remote_rtnl(f); err = vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH, swdev_notify, extack); if (err) goto err_notify; } return 0; err_notify: if (nhid) return err; if ((flags & NLM_F_REPLACE) && rc) *rd = oldrd; else if ((flags & NLM_F_APPEND) && rc) { list_del_rcu(&rd->list); call_rcu(&rd->rcu, vxlan_dst_free); } return err; } static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { __u16 fdb_flags = (ndm_flags & ~NTF_USE); struct vxlan_fdb *f; int rc; /* Disallow replace to add a multicast entry */ if ((flags & NLM_F_REPLACE) && (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac))) return -EOPNOTSUPP; netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni, vni, ifindex, fdb_flags, nhid, &f, extack); if (rc < 0) return rc; vxlan_fdb_insert(vxlan, mac, src_vni, f); rc = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH, swdev_notify, extack); if (rc) goto err_notify; return 0; err_notify: vxlan_fdb_destroy(vxlan, f, false, false); return rc; } /* Add new entry to forwarding table -- assumes lock held */ int vxlan_fdb_update(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { struct vxlan_fdb *f; f = __vxlan_find_mac(vxlan, mac, src_vni); if (f) { if (flags & NLM_F_EXCL) { netdev_dbg(vxlan->dev, "lost race to create %pM\n", mac); return -EEXIST; } return vxlan_fdb_update_existing(vxlan, ip, state, flags, port, vni, ifindex, ndm_flags, f, nhid, swdev_notify, extack); } else { if (!(flags & NLM_F_CREATE)) return -ENOENT; return vxlan_fdb_update_create(vxlan, mac, ip, state, flags, port, src_vni, vni, ifindex, ndm_flags, nhid, swdev_notify, extack); } } static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, struct vxlan_rdst *rd, bool swdev_notify) { list_del_rcu(&rd->list); vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, swdev_notify, NULL); call_rcu(&rd->rcu, vxlan_dst_free); } static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, union vxlan_addr *ip, __be16 *port, __be32 *src_vni, __be32 *vni, u32 *ifindex, u32 *nhid, struct netlink_ext_ack *extack) { struct net *net = dev_net(vxlan->dev); int err; if (tb[NDA_NH_ID] && (tb[NDA_DST] || tb[NDA_VNI] || tb[NDA_IFINDEX] || tb[NDA_PORT])) { NL_SET_ERR_MSG(extack, "DST, VNI, ifindex and port are mutually exclusive with NH_ID"); return -EINVAL; } if (tb[NDA_DST]) { err = vxlan_nla_get_addr(ip, tb[NDA_DST]); if (err) { NL_SET_ERR_MSG(extack, "Unsupported address family"); return err; } } else { union vxlan_addr *remote = &vxlan->default_dst.remote_ip; if (remote->sa.sa_family == AF_INET) { ip->sin.sin_addr.s_addr = htonl(INADDR_ANY); ip->sa.sa_family = AF_INET; #if IS_ENABLED(CONFIG_IPV6) } else { ip->sin6.sin6_addr = in6addr_any; ip->sa.sa_family = AF_INET6; #endif } } if (tb[NDA_PORT]) { if (nla_len(tb[NDA_PORT]) != sizeof(__be16)) { NL_SET_ERR_MSG(extack, "Invalid vxlan port"); return -EINVAL; } *port = nla_get_be16(tb[NDA_PORT]); } else { *port = vxlan->cfg.dst_port; } if (tb[NDA_VNI]) { if (nla_len(tb[NDA_VNI]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid vni"); return -EINVAL; } *vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI])); } else { *vni = vxlan->default_dst.remote_vni; } if (tb[NDA_SRC_VNI]) { if (nla_len(tb[NDA_SRC_VNI]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid src vni"); return -EINVAL; } *src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI])); } else { *src_vni = vxlan->default_dst.remote_vni; } if (tb[NDA_IFINDEX]) { struct net_device *tdev; if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid ifindex"); return -EINVAL; } *ifindex = nla_get_u32(tb[NDA_IFINDEX]); tdev = __dev_get_by_index(net, *ifindex); if (!tdev) { NL_SET_ERR_MSG(extack, "Device not found"); return -EADDRNOTAVAIL; } } else { *ifindex = 0; } if (tb[NDA_NH_ID]) *nhid = nla_get_u32(tb[NDA_NH_ID]); else *nhid = 0; return 0; } /* Add static entry (via netlink) */ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); /* struct net *net = dev_net(vxlan->dev); */ union vxlan_addr ip; __be16 port; __be32 src_vni, vni; u32 ifindex, nhid; u32 hash_index; int err; if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) { pr_info("RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state); return -EINVAL; } if (!tb || (!tb[NDA_DST] && !tb[NDA_NH_ID])) return -EINVAL; err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex, &nhid, extack); if (err) return err; if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family) return -EAFNOSUPPORT; hash_index = fdb_head_index(vxlan, addr, src_vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags, port, src_vni, vni, ifindex, ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER, nhid, true, extack); spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; } int __vxlan_fdb_delete(struct vxlan_dev *vxlan, const unsigned char *addr, union vxlan_addr ip, __be16 port, __be32 src_vni, __be32 vni, u32 ifindex, bool swdev_notify) { struct vxlan_rdst *rd = NULL; struct vxlan_fdb *f; int err = -ENOENT; f = vxlan_find_mac(vxlan, addr, src_vni); if (!f) return err; if (!vxlan_addr_any(&ip)) { rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex); if (!rd) goto out; } /* remove a destination if it's not the only one on the list, * otherwise destroy the fdb entry */ if (rd && !list_is_singular(&f->remotes)) { vxlan_fdb_dst_destroy(vxlan, f, rd, swdev_notify); goto out; } vxlan_fdb_destroy(vxlan, f, true, swdev_notify); out: return 0; } /* Delete entry (via netlink) */ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); union vxlan_addr ip; __be32 src_vni, vni; u32 ifindex, nhid; u32 hash_index; __be16 port; int err; err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex, &nhid, extack); if (err) return err; hash_index = fdb_head_index(vxlan, addr, src_vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex, true); spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; } /* Dump forwarding table */ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { struct vxlan_dev *vxlan = netdev_priv(dev); unsigned int h; int err = 0; for (h = 0; h < FDB_HASH_SIZE; ++h) { struct vxlan_fdb *f; rcu_read_lock(); hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { struct vxlan_rdst *rd; if (rcu_access_pointer(f->nh)) { if (*idx < cb->args[2]) goto skip_nh; err = vxlan_fdb_info(skb, vxlan, f, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI, NULL); if (err < 0) { rcu_read_unlock(); goto out; } skip_nh: *idx += 1; continue; } list_for_each_entry_rcu(rd, &f->remotes, list) { if (*idx < cb->args[2]) goto skip; err = vxlan_fdb_info(skb, vxlan, f, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI, rd); if (err < 0) { rcu_read_unlock(); goto out; } skip: *idx += 1; } } rcu_read_unlock(); } out: return err; } static int vxlan_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u32 portid, u32 seq, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; __be32 vni; int err; if (tb[NDA_VNI]) vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI])); else vni = vxlan->default_dst.remote_vni; rcu_read_lock(); f = __vxlan_find_mac(vxlan, addr, vni); if (!f) { NL_SET_ERR_MSG(extack, "Fdb entry not found"); err = -ENOENT; goto errout; } err = vxlan_fdb_info(skb, vxlan, f, portid, seq, RTM_NEWNEIGH, 0, first_remote_rcu(f)); errout: rcu_read_unlock(); return err; } /* Watch incoming packets to learn mapping between Ethernet address * and Tunnel endpoint. * Return true if packet is bogus and should be dropped. */ static bool vxlan_snoop(struct net_device *dev, union vxlan_addr *src_ip, const u8 *src_mac, u32 src_ifindex, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; u32 ifindex = 0; /* Ignore packets from invalid src-address */ if (!is_valid_ether_addr(src_mac)) return true; #if IS_ENABLED(CONFIG_IPV6) if (src_ip->sa.sa_family == AF_INET6 && (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)) ifindex = src_ifindex; #endif f = vxlan_find_mac(vxlan, src_mac, vni); if (likely(f)) { struct vxlan_rdst *rdst = first_remote_rcu(f); if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) && rdst->remote_ifindex == ifindex)) return false; /* Don't migrate static entries, drop packets */ if (f->state & (NUD_PERMANENT | NUD_NOARP)) return true; /* Don't override an fdb with nexthop with a learnt entry */ if (rcu_access_pointer(f->nh)) return true; if (net_ratelimit()) netdev_info(dev, "%pM migrated from %pIS to %pIS\n", src_mac, &rdst->remote_ip.sa, &src_ip->sa); rdst->remote_ip = *src_ip; f->updated = jiffies; vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL); } else { u32 hash_index = fdb_head_index(vxlan, src_mac, vni); /* learned new entry */ spin_lock(&vxlan->hash_lock[hash_index]); /* close off race between vxlan_flush and incoming packets */ if (netif_running(dev)) vxlan_fdb_update(vxlan, src_mac, src_ip, NUD_REACHABLE, NLM_F_EXCL|NLM_F_CREATE, vxlan->cfg.dst_port, vni, vxlan->default_dst.remote_vni, ifindex, NTF_SELF, 0, true, NULL); spin_unlock(&vxlan->hash_lock[hash_index]); } return false; } static bool __vxlan_sock_release_prep(struct vxlan_sock *vs) { struct vxlan_net *vn; if (!vs) return false; if (!refcount_dec_and_test(&vs->refcnt)) return false; vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id); spin_lock(&vn->sock_lock); hlist_del_rcu(&vs->hlist); udp_tunnel_notify_del_rx_port(vs->sock, (vs->flags & VXLAN_F_GPE) ? UDP_TUNNEL_TYPE_VXLAN_GPE : UDP_TUNNEL_TYPE_VXLAN); spin_unlock(&vn->sock_lock); return true; } static void vxlan_sock_release(struct vxlan_dev *vxlan) { struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock); #if IS_ENABLED(CONFIG_IPV6) struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock); RCU_INIT_POINTER(vxlan->vn6_sock, NULL); #endif RCU_INIT_POINTER(vxlan->vn4_sock, NULL); synchronize_net(); if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vs_del_vnigrp(vxlan); else vxlan_vs_del_dev(vxlan); if (__vxlan_sock_release_prep(sock4)) { udp_tunnel_sock_release(sock4->sock); kfree(sock4); } #if IS_ENABLED(CONFIG_IPV6) if (__vxlan_sock_release_prep(sock6)) { udp_tunnel_sock_release(sock6->sock); kfree(sock6); } #endif } static bool vxlan_remcsum(struct vxlanhdr *unparsed, struct sk_buff *skb, u32 vxflags) { size_t start, offset; if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload) goto out; start = vxlan_rco_start(unparsed->vx_vni); offset = start + vxlan_rco_offset(unparsed->vx_vni); if (!pskb_may_pull(skb, offset + sizeof(u16))) return false; skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset, !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL)); out: unparsed->vx_flags &= ~VXLAN_HF_RCO; unparsed->vx_vni &= VXLAN_VNI_MASK; return true; } static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed, struct sk_buff *skb, u32 vxflags, struct vxlan_metadata *md) { struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed; struct metadata_dst *tun_dst; if (!(unparsed->vx_flags & VXLAN_HF_GBP)) goto out; md->gbp = ntohs(gbp->policy_id); tun_dst = (struct metadata_dst *)skb_dst(skb); if (tun_dst) { __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_dst->u.tun_info.key.tun_flags); tun_dst->u.tun_info.options_len = sizeof(*md); } if (gbp->dont_learn) md->gbp |= VXLAN_GBP_DONT_LEARN; if (gbp->policy_applied) md->gbp |= VXLAN_GBP_POLICY_APPLIED; /* In flow-based mode, GBP is carried in dst_metadata */ if (!(vxflags & VXLAN_F_COLLECT_METADATA)) skb->mark = md->gbp; out: unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS; } static bool vxlan_set_mac(struct vxlan_dev *vxlan, struct vxlan_sock *vs, struct sk_buff *skb, __be32 vni) { union vxlan_addr saddr; u32 ifindex = skb->dev->ifindex; skb_reset_mac_header(skb); skb->protocol = eth_type_trans(skb, vxlan->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); /* Ignore packet loops (and multicast echo) */ if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) return false; /* Get address from the outer IP header */ if (vxlan_get_sk_family(vs) == AF_INET) { saddr.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; saddr.sa.sa_family = AF_INET; #if IS_ENABLED(CONFIG_IPV6) } else { saddr.sin6.sin6_addr = ipv6_hdr(skb)->saddr; saddr.sa.sa_family = AF_INET6; #endif } if ((vxlan->cfg.flags & VXLAN_F_LEARN) && vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source, ifindex, vni)) return false; return true; } static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph, struct sk_buff *skb) { int err = 0; if (vxlan_get_sk_family(vs) == AF_INET) err = IP_ECN_decapsulate(oiph, skb); #if IS_ENABLED(CONFIG_IPV6) else err = IP6_ECN_decapsulate(oiph, skb); #endif if (unlikely(err) && log_ecn_error) { if (vxlan_get_sk_family(vs) == AF_INET) net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", &((struct iphdr *)oiph)->saddr, ((struct iphdr *)oiph)->tos); else net_info_ratelimited("non-ECT from %pI6\n", &((struct ipv6hdr *)oiph)->saddr); } return err <= 1; } /* Callback from net/ipv4/udp.c to receive packets */ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) { struct vxlan_vni_node *vninode = NULL; struct vxlan_dev *vxlan; struct vxlan_sock *vs; struct vxlanhdr unparsed; struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; __be16 protocol = htons(ETH_P_TEB); bool raw_proto = false; void *oiph; __be32 vni = 0; int nh; /* Need UDP and VXLAN header to be present */ if (!pskb_may_pull(skb, VXLAN_HLEN)) goto drop; unparsed = *vxlan_hdr(skb); /* VNI flag always required to be set */ if (!(unparsed.vx_flags & VXLAN_HF_VNI)) { netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", ntohl(vxlan_hdr(skb)->vx_flags), ntohl(vxlan_hdr(skb)->vx_vni)); /* Return non vxlan pkt */ goto drop; } unparsed.vx_flags &= ~VXLAN_HF_VNI; unparsed.vx_vni &= ~VXLAN_VNI_MASK; vs = rcu_dereference_sk_user_data(sk); if (!vs) goto drop; vni = vxlan_vni(vxlan_hdr(skb)->vx_vni); vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, &vninode); if (!vxlan) goto drop; /* For backwards compatibility, only allow reserved fields to be * used by VXLAN extensions if explicitly requested. */ if (vs->flags & VXLAN_F_GPE) { if (!vxlan_parse_gpe_proto(&unparsed, &protocol)) goto drop; unparsed.vx_flags &= ~VXLAN_GPE_USED_BITS; raw_proto = true; } if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto, !net_eq(vxlan->net, dev_net(vxlan->dev)))) goto drop; if (vs->flags & VXLAN_F_REMCSUM_RX) if (unlikely(!vxlan_remcsum(&unparsed, skb, vs->flags))) goto drop; if (vxlan_collect_metadata(vs)) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *tun_dst; __set_bit(IP_TUNNEL_KEY_BIT, flags); tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), flags, key32_to_tunnel_id(vni), sizeof(*md)); if (!tun_dst) goto drop; md = ip_tunnel_info_opts(&tun_dst->u.tun_info); skb_dst_set(skb, (struct dst_entry *)tun_dst); } else { memset(md, 0, sizeof(*md)); } if (vs->flags & VXLAN_F_GBP) vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md); /* Note that GBP and GPE can never be active together. This is * ensured in vxlan_dev_configure. */ if (unparsed.vx_flags || unparsed.vx_vni) { /* If there are any unprocessed flags remaining treat * this as a malformed packet. This behavior diverges from * VXLAN RFC (RFC7348) which stipulates that bits in reserved * in reserved fields are to be ignored. The approach here * maintains compatibility with previous stack code, and also * is more robust and provides a little more security in * adding extensions to VXLAN. */ goto drop; } if (!raw_proto) { if (!vxlan_set_mac(vxlan, vs, skb, vni)) goto drop; } else { skb_reset_mac_header(skb); skb->dev = vxlan->dev; skb->pkt_type = PACKET_HOST; } /* Save offset of outer header relative to skb->head, * because we are going to reset the network header to the inner header * and might change skb->head. */ nh = skb_network_header(skb) - skb->head; skb_reset_network_header(skb); if (!pskb_inet_may_pull(skb)) { DEV_STATS_INC(vxlan->dev, rx_length_errors); DEV_STATS_INC(vxlan->dev, rx_errors); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX_ERRORS, 0); goto drop; } /* Get the outer header. */ oiph = skb->head + nh; if (!vxlan_ecn_decapsulate(vs, oiph, skb)) { DEV_STATS_INC(vxlan->dev, rx_frame_errors); DEV_STATS_INC(vxlan->dev, rx_errors); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX_ERRORS, 0); goto drop; } rcu_read_lock(); if (unlikely(!(vxlan->dev->flags & IFF_UP))) { rcu_read_unlock(); dev_core_stats_rx_dropped_inc(vxlan->dev); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX_DROPS, 0); goto drop; } dev_sw_netstats_rx_add(vxlan->dev, skb->len); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX, skb->len); gro_cells_receive(&vxlan->gro_cells, skb); rcu_read_unlock(); return 0; drop: /* Consume bad packet */ kfree_skb(skb); return 0; } /* Callback from net/ipv{4,6}/udp.c to check that we have a VNI for errors */ static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb) { struct vxlan_dev *vxlan; struct vxlan_sock *vs; struct vxlanhdr *hdr; __be32 vni; if (!pskb_may_pull(skb, skb_transport_offset(skb) + VXLAN_HLEN)) return -EINVAL; hdr = vxlan_hdr(skb); if (!(hdr->vx_flags & VXLAN_HF_VNI)) return -EINVAL; vs = rcu_dereference_sk_user_data(sk); if (!vs) return -ENOENT; vni = vxlan_vni(hdr->vx_vni); vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, NULL); if (!vxlan) return -ENOENT; return 0; } static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); struct arphdr *parp; u8 *arpptr, *sha; __be32 sip, tip; struct neighbour *n; if (dev->flags & IFF_NOARP) goto out; if (!pskb_may_pull(skb, arp_hdr_len(dev))) { dev_core_stats_tx_dropped_inc(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); goto out; } parp = arp_hdr(skb); if ((parp->ar_hrd != htons(ARPHRD_ETHER) && parp->ar_hrd != htons(ARPHRD_IEEE802)) || parp->ar_pro != htons(ETH_P_IP) || parp->ar_op != htons(ARPOP_REQUEST) || parp->ar_hln != dev->addr_len || parp->ar_pln != 4) goto out; arpptr = (u8 *)parp + sizeof(struct arphdr); sha = arpptr; arpptr += dev->addr_len; /* sha */ memcpy(&sip, arpptr, sizeof(sip)); arpptr += sizeof(sip); arpptr += dev->addr_len; /* tha */ memcpy(&tip, arpptr, sizeof(tip)); if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) goto out; n = neigh_lookup(&arp_tbl, &tip, dev); if (n) { struct vxlan_fdb *f; struct sk_buff *reply; if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED)) { neigh_release(n); goto out; } f = vxlan_find_mac(vxlan, n->ha, vni); if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { /* bridge-local neighbor */ neigh_release(n); goto out; } reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, n->ha, sha); neigh_release(n); if (reply == NULL) goto out; skb_reset_mac_header(reply); __skb_pull(reply, skb_network_offset(reply)); reply->ip_summed = CHECKSUM_UNNECESSARY; reply->pkt_type = PACKET_HOST; if (netif_rx(reply) == NET_RX_DROP) { dev_core_stats_rx_dropped_inc(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } } else if (vxlan->cfg.flags & VXLAN_F_L3MISS) { union vxlan_addr ipa = { .sin.sin_addr.s_addr = tip, .sin.sin_family = AF_INET, }; vxlan_ip_miss(dev, &ipa); } out: consume_skb(skb); return NETDEV_TX_OK; } #if IS_ENABLED(CONFIG_IPV6) static struct sk_buff *vxlan_na_create(struct sk_buff *request, struct neighbour *n, bool isrouter) { struct net_device *dev = request->dev; struct sk_buff *reply; struct nd_msg *ns, *na; struct ipv6hdr *pip6; u8 *daddr; int na_olen = 8; /* opt hdr + ETH_ALEN for target */ int ns_olen; int i, len; if (dev == NULL || !pskb_may_pull(request, request->len)) return NULL; len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) + sizeof(*na) + na_olen + dev->needed_tailroom; reply = alloc_skb(len, GFP_ATOMIC); if (reply == NULL) return NULL; reply->protocol = htons(ETH_P_IPV6); reply->dev = dev; skb_reserve(reply, LL_RESERVED_SPACE(request->dev)); skb_push(reply, sizeof(struct ethhdr)); skb_reset_mac_header(reply); ns = (struct nd_msg *)(ipv6_hdr(request) + 1); daddr = eth_hdr(request)->h_source; ns_olen = request->len - skb_network_offset(request) - sizeof(struct ipv6hdr) - sizeof(*ns); for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) { if (!ns->opt[i + 1]) { kfree_skb(reply); return NULL; } if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { daddr = ns->opt + i + sizeof(struct nd_opt_hdr); break; } } /* Ethernet header */ ether_addr_copy(eth_hdr(reply)->h_dest, daddr); ether_addr_copy(eth_hdr(reply)->h_source, n->ha); eth_hdr(reply)->h_proto = htons(ETH_P_IPV6); reply->protocol = htons(ETH_P_IPV6); skb_pull(reply, sizeof(struct ethhdr)); skb_reset_network_header(reply); skb_put(reply, sizeof(struct ipv6hdr)); /* IPv6 header */ pip6 = ipv6_hdr(reply); memset(pip6, 0, sizeof(struct ipv6hdr)); pip6->version = 6; pip6->priority = ipv6_hdr(request)->priority; pip6->nexthdr = IPPROTO_ICMPV6; pip6->hop_limit = 255; pip6->daddr = ipv6_hdr(request)->saddr; pip6->saddr = *(struct in6_addr *)n->primary_key; skb_pull(reply, sizeof(struct ipv6hdr)); skb_reset_transport_header(reply); /* Neighbor Advertisement */ na = skb_put_zero(reply, sizeof(*na) + na_olen); na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; na->icmph.icmp6_router = isrouter; na->icmph.icmp6_override = 1; na->icmph.icmp6_solicited = 1; na->target = ns->target; ether_addr_copy(&na->opt[2], n->ha); na->opt[0] = ND_OPT_TARGET_LL_ADDR; na->opt[1] = na_olen >> 3; na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6, csum_partial(na, sizeof(*na)+na_olen, 0)); pip6->payload_len = htons(sizeof(*na)+na_olen); skb_push(reply, sizeof(struct ipv6hdr)); reply->ip_summed = CHECKSUM_UNNECESSARY; return reply; } static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); const struct in6_addr *daddr; const struct ipv6hdr *iphdr; struct inet6_dev *in6_dev; struct neighbour *n; struct nd_msg *msg; rcu_read_lock(); in6_dev = __in6_dev_get(dev); if (!in6_dev) goto out; iphdr = ipv6_hdr(skb); daddr = &iphdr->daddr; msg = (struct nd_msg *)(iphdr + 1); if (ipv6_addr_loopback(daddr) || ipv6_addr_is_multicast(&msg->target)) goto out; n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev); if (n) { struct vxlan_fdb *f; struct sk_buff *reply; if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED)) { neigh_release(n); goto out; } f = vxlan_find_mac(vxlan, n->ha, vni); if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { /* bridge-local neighbor */ neigh_release(n); goto out; } reply = vxlan_na_create(skb, n, !!(f ? f->flags & NTF_ROUTER : 0)); neigh_release(n); if (reply == NULL) goto out; if (netif_rx(reply) == NET_RX_DROP) { dev_core_stats_rx_dropped_inc(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } } else if (vxlan->cfg.flags & VXLAN_F_L3MISS) { union vxlan_addr ipa = { .sin6.sin6_addr = msg->target, .sin6.sin6_family = AF_INET6, }; vxlan_ip_miss(dev, &ipa); } out: rcu_read_unlock(); consume_skb(skb); return NETDEV_TX_OK; } #endif static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) { struct vxlan_dev *vxlan = netdev_priv(dev); struct neighbour *n; if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) return false; n = NULL; switch (ntohs(eth_hdr(skb)->h_proto)) { case ETH_P_IP: { struct iphdr *pip; if (!pskb_may_pull(skb, sizeof(struct iphdr))) return false; pip = ip_hdr(skb); n = neigh_lookup(&arp_tbl, &pip->daddr, dev); if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) { union vxlan_addr ipa = { .sin.sin_addr.s_addr = pip->daddr, .sin.sin_family = AF_INET, }; vxlan_ip_miss(dev, &ipa); return false; } break; } #if IS_ENABLED(CONFIG_IPV6) case ETH_P_IPV6: { struct ipv6hdr *pip6; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) return false; pip6 = ipv6_hdr(skb); n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev); if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) { union vxlan_addr ipa = { .sin6.sin6_addr = pip6->daddr, .sin6.sin6_family = AF_INET6, }; vxlan_ip_miss(dev, &ipa); return false; } break; } #endif default: return false; } if (n) { bool diff; diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha); if (diff) { memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, dev->addr_len); memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len); } neigh_release(n); return diff; } return false; } static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, __be16 protocol) { struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh; gpe->np_applied = 1; gpe->next_protocol = tun_p_from_eth_p(protocol); if (!gpe->next_protocol) return -EPFNOSUPPORT; return 0; } static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, int iphdr_len, __be32 vni, struct vxlan_metadata *md, u32 vxflags, bool udp_sum) { struct vxlanhdr *vxh; int min_headroom; int err; int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; __be16 inner_protocol = htons(ETH_P_TEB); if ((vxflags & VXLAN_F_REMCSUM_TX) && skb->ip_summed == CHECKSUM_PARTIAL) { int csum_start = skb_checksum_start_offset(skb); if (csum_start <= VXLAN_MAX_REMCSUM_START && !(csum_start & VXLAN_RCO_SHIFT_MASK) && (skb->csum_offset == offsetof(struct udphdr, check) || skb->csum_offset == offsetof(struct tcphdr, check))) type |= SKB_GSO_TUNNEL_REMCSUM; } min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + VXLAN_HLEN + iphdr_len; /* Need space for new headers (invalidates iph ptr) */ err = skb_cow_head(skb, min_headroom); if (unlikely(err)) return err; err = iptunnel_handle_offloads(skb, type); if (err) return err; vxh = __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = VXLAN_HF_VNI; vxh->vx_vni = vxlan_vni_field(vni); if (type & SKB_GSO_TUNNEL_REMCSUM) { unsigned int start; start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr); vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset); vxh->vx_flags |= VXLAN_HF_RCO; if (!skb_is_gso(skb)) { skb->ip_summed = CHECKSUM_NONE; skb->encapsulation = 0; } } if (vxflags & VXLAN_F_GBP) vxlan_build_gbp_hdr(vxh, md); if (vxflags & VXLAN_F_GPE) { err = vxlan_build_gpe_hdr(vxh, skb->protocol); if (err < 0) return err; inner_protocol = skb->protocol; } skb_set_inner_protocol(skb, inner_protocol); return 0; } /* Bypass encapsulation if the destination is local */ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, struct vxlan_dev *dst_vxlan, __be32 vni, bool snoop) { union vxlan_addr loopback; union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip; struct net_device *dev; int len = skb->len; skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; skb->dev = dst_vxlan->dev; __skb_pull(skb, skb_network_offset(skb)); if (remote_ip->sa.sa_family == AF_INET) { loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); loopback.sa.sa_family = AF_INET; #if IS_ENABLED(CONFIG_IPV6) } else { loopback.sin6.sin6_addr = in6addr_loopback; loopback.sa.sa_family = AF_INET6; #endif } rcu_read_lock(); dev = skb->dev; if (unlikely(!(dev->flags & IFF_UP))) { kfree_skb(skb); goto drop; } if ((dst_vxlan->cfg.flags & VXLAN_F_LEARN) && snoop) vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni); dev_sw_netstats_tx_add(src_vxlan->dev, 1, len); vxlan_vnifilter_count(src_vxlan, vni, NULL, VXLAN_VNI_STATS_TX, len); if (__netif_rx(skb) == NET_RX_SUCCESS) { dev_sw_netstats_rx_add(dst_vxlan->dev, len); vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX, len); } else { drop: dev_core_stats_rx_dropped_inc(dev); vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } rcu_read_unlock(); } static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, struct vxlan_dev *vxlan, int addr_family, __be16 dst_port, int dst_ifindex, __be32 vni, struct dst_entry *dst, u32 rt_flags) { #if IS_ENABLED(CONFIG_IPV6) /* IPv6 rt-flags are checked against RTF_LOCAL, but the value of * RTF_LOCAL is equal to RTCF_LOCAL. So to keep code simple * we can use RTCF_LOCAL which works for ipv4 and ipv6 route entry. */ BUILD_BUG_ON(RTCF_LOCAL != RTF_LOCAL); #endif /* Bypass encapsulation if the destination is local */ if (rt_flags & RTCF_LOCAL && !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && vxlan->cfg.flags & VXLAN_F_LOCALBYPASS) { struct vxlan_dev *dst_vxlan; dst_release(dst); dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni, addr_family, dst_port, vxlan->cfg.flags); if (!dst_vxlan) { DEV_STATS_INC(dev, tx_errors); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); kfree_skb(skb); return -ENOENT; } vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true); return 1; } return 0; } void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, __be32 default_vni, struct vxlan_rdst *rdst, bool did_rsc) { struct dst_cache *dst_cache; struct ip_tunnel_info *info; struct ip_tunnel_key *pkey; struct ip_tunnel_key key; struct vxlan_dev *vxlan = netdev_priv(dev); const struct iphdr *old_iph; struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; unsigned int pkt_len = skb->len; __be16 src_port = 0, dst_port; struct dst_entry *ndst = NULL; int addr_family; __u8 tos, ttl; int ifindex; int err; u32 flags = vxlan->cfg.flags; bool use_cache; bool udp_sum = false; bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev)); bool no_eth_encap; __be32 vni = 0; no_eth_encap = flags & VXLAN_F_GPE && skb->protocol != htons(ETH_P_TEB); if (!skb_vlan_inet_prepare(skb, no_eth_encap)) goto drop; old_iph = ip_hdr(skb); info = skb_tunnel_info(skb); use_cache = ip_tunnel_dst_cache_usable(skb, info); if (rdst) { memset(&key, 0, sizeof(key)); pkey = &key; if (vxlan_addr_any(&rdst->remote_ip)) { if (did_rsc) { /* short-circuited back to local bridge */ vxlan_encap_bypass(skb, vxlan, vxlan, default_vni, true); return; } goto drop; } addr_family = vxlan->cfg.saddr.sa.sa_family; dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; vni = (rdst->remote_vni) ? : default_vni; ifindex = rdst->remote_ifindex; if (addr_family == AF_INET) { key.u.ipv4.src = vxlan->cfg.saddr.sin.sin_addr.s_addr; key.u.ipv4.dst = rdst->remote_ip.sin.sin_addr.s_addr; } else { key.u.ipv6.src = vxlan->cfg.saddr.sin6.sin6_addr; key.u.ipv6.dst = rdst->remote_ip.sin6.sin6_addr; } dst_cache = &rdst->dst_cache; md->gbp = skb->mark; if (flags & VXLAN_F_TTL_INHERIT) { ttl = ip_tunnel_get_ttl(old_iph, skb); } else { ttl = vxlan->cfg.ttl; if (!ttl && vxlan_addr_multicast(&rdst->remote_ip)) ttl = 1; } tos = vxlan->cfg.tos; if (tos == 1) tos = ip_tunnel_get_dsfield(old_iph, skb); if (tos && !info) use_cache = false; if (addr_family == AF_INET) udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX); else udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX); #if IS_ENABLED(CONFIG_IPV6) switch (vxlan->cfg.label_policy) { case VXLAN_LABEL_FIXED: key.label = vxlan->cfg.label; break; case VXLAN_LABEL_INHERIT: key.label = ip_tunnel_get_flowlabel(old_iph, skb); break; default: DEBUG_NET_WARN_ON_ONCE(1); goto drop; } #endif } else { if (!info) { WARN_ONCE(1, "%s: Missing encapsulation instructions\n", dev->name); goto drop; } pkey = &info->key; addr_family = ip_tunnel_info_af(info); dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; vni = tunnel_id_to_key32(info->key.tun_id); ifindex = 0; dst_cache = &info->dst_cache; if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) { if (info->options_len < sizeof(*md)) goto drop; md = ip_tunnel_info_opts(info); } ttl = info->key.ttl; tos = info->key.tos; udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); } src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, vxlan->cfg.port_max, true); rcu_read_lock(); if (addr_family == AF_INET) { struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); struct rtable *rt; __be16 df = 0; __be32 saddr; if (!ifindex) ifindex = sock4->sock->sk->sk_bound_dev_if; rt = udp_tunnel_dst_lookup(skb, dev, vxlan->net, ifindex, &saddr, pkey, src_port, dst_port, tos, use_cache ? dst_cache : NULL); if (IS_ERR(rt)) { err = PTR_ERR(rt); goto tx_error; } if (!info) { /* Bypass encapsulation if the destination is local */ err = encap_bypass_if_local(skb, dev, vxlan, AF_INET, dst_port, ifindex, vni, &rt->dst, rt->rt_flags); if (err) goto out_unlock; if (vxlan->cfg.df == VXLAN_DF_SET) { df = htons(IP_DF); } else if (vxlan->cfg.df == VXLAN_DF_INHERIT) { struct ethhdr *eth = eth_hdr(skb); if (ntohs(eth->h_proto) == ETH_P_IPV6 || (ntohs(eth->h_proto) == ETH_P_IP && old_iph->frag_off & htons(IP_DF))) df = htons(IP_DF); } } else if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags)) { df = htons(IP_DF); } ndst = &rt->dst; err = skb_tunnel_check_pmtu(skb, ndst, vxlan_headroom(flags & VXLAN_F_GPE), netif_is_any_bridge_port(dev)); if (err < 0) { goto tx_error; } else if (err) { if (info) { struct ip_tunnel_info *unclone; unclone = skb_tunnel_info_unclone(skb); if (unlikely(!unclone)) goto tx_error; unclone->key.u.ipv4.src = pkey->u.ipv4.dst; unclone->key.u.ipv4.dst = saddr; } vxlan_encap_bypass(skb, vxlan, vxlan, vni, false); dst_release(ndst); goto out_unlock; } tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr), vni, md, flags, udp_sum); if (err < 0) goto tx_error; udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, saddr, pkey->u.ipv4.dst, tos, ttl, df, src_port, dst_port, xnet, !udp_sum); #if IS_ENABLED(CONFIG_IPV6) } else { struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); struct in6_addr saddr; if (!ifindex) ifindex = sock6->sock->sk->sk_bound_dev_if; ndst = udp_tunnel6_dst_lookup(skb, dev, vxlan->net, sock6->sock, ifindex, &saddr, pkey, src_port, dst_port, tos, use_cache ? dst_cache : NULL); if (IS_ERR(ndst)) { err = PTR_ERR(ndst); ndst = NULL; goto tx_error; } if (!info) { u32 rt6i_flags = dst_rt6_info(ndst)->rt6i_flags; err = encap_bypass_if_local(skb, dev, vxlan, AF_INET6, dst_port, ifindex, vni, ndst, rt6i_flags); if (err) goto out_unlock; } err = skb_tunnel_check_pmtu(skb, ndst, vxlan_headroom((flags & VXLAN_F_GPE) | VXLAN_F_IPV6), netif_is_any_bridge_port(dev)); if (err < 0) { goto tx_error; } else if (err) { if (info) { struct ip_tunnel_info *unclone; unclone = skb_tunnel_info_unclone(skb); if (unlikely(!unclone)) goto tx_error; unclone->key.u.ipv6.src = pkey->u.ipv6.dst; unclone->key.u.ipv6.dst = saddr; } vxlan_encap_bypass(skb, vxlan, vxlan, vni, false); dst_release(ndst); goto out_unlock; } tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip6_dst_hoplimit(ndst); skb_scrub_packet(skb, xnet); err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr), vni, md, flags, udp_sum); if (err < 0) goto tx_error; udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev, &saddr, &pkey->u.ipv6.dst, tos, ttl, pkey->label, src_port, dst_port, !udp_sum); #endif } vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len); out_unlock: rcu_read_unlock(); return; drop: dev_core_stats_tx_dropped_inc(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); return; tx_error: rcu_read_unlock(); if (err == -ELOOP) DEV_STATS_INC(dev, collisions); else if (err == -ENETUNREACH) DEV_STATS_INC(dev, tx_carrier_errors); dst_release(ndst); DEV_STATS_INC(dev, tx_errors); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); kfree_skb(skb); } static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev, struct vxlan_fdb *f, __be32 vni, bool did_rsc) { struct vxlan_rdst nh_rdst; struct nexthop *nh; bool do_xmit; u32 hash; memset(&nh_rdst, 0, sizeof(struct vxlan_rdst)); hash = skb_get_hash(skb); rcu_read_lock(); nh = rcu_dereference(f->nh); if (!nh) { rcu_read_unlock(); goto drop; } do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst); rcu_read_unlock(); if (likely(do_xmit)) vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc); else goto drop; return; drop: dev_core_stats_tx_dropped_inc(dev); vxlan_vnifilter_count(netdev_priv(dev), vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); } static netdev_tx_t vxlan_xmit_nhid(struct sk_buff *skb, struct net_device *dev, u32 nhid, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst nh_rdst; struct nexthop *nh; bool do_xmit; u32 hash; memset(&nh_rdst, 0, sizeof(struct vxlan_rdst)); hash = skb_get_hash(skb); rcu_read_lock(); nh = nexthop_find_by_id(dev_net(dev), nhid); if (unlikely(!nh || !nexthop_is_fdb(nh) || !nexthop_is_multipath(nh))) { rcu_read_unlock(); goto drop; } do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst); rcu_read_unlock(); if (vxlan->cfg.saddr.sa.sa_family != nh_rdst.remote_ip.sa.sa_family) goto drop; if (likely(do_xmit)) vxlan_xmit_one(skb, dev, vni, &nh_rdst, false); else goto drop; return NETDEV_TX_OK; drop: dev_core_stats_tx_dropped_inc(dev); vxlan_vnifilter_count(netdev_priv(dev), vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); return NETDEV_TX_OK; } /* Transmit local packets over Vxlan * * Outer IP header inherits ECN and DF from inner header. * Outer UDP destination is the VXLAN assigned port. * source port is based on hash of flow */ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *rdst, *fdst = NULL; const struct ip_tunnel_info *info; struct vxlan_fdb *f; struct ethhdr *eth; __be32 vni = 0; u32 nhid = 0; bool did_rsc; info = skb_tunnel_info(skb); skb_reset_mac_header(skb); if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) { if (info && info->mode & IP_TUNNEL_INFO_BRIDGE && info->mode & IP_TUNNEL_INFO_TX) { vni = tunnel_id_to_key32(info->key.tun_id); nhid = info->key.nhid; } else { if (info && info->mode & IP_TUNNEL_INFO_TX) vxlan_xmit_one(skb, dev, vni, NULL, false); else kfree_skb(skb); return NETDEV_TX_OK; } } if (vxlan->cfg.flags & VXLAN_F_PROXY) { eth = eth_hdr(skb); if (ntohs(eth->h_proto) == ETH_P_ARP) return arp_reduce(dev, skb, vni); #if IS_ENABLED(CONFIG_IPV6) else if (ntohs(eth->h_proto) == ETH_P_IPV6 && pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)) && ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { struct nd_msg *m = (struct nd_msg *)(ipv6_hdr(skb) + 1); if (m->icmph.icmp6_code == 0 && m->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) return neigh_reduce(dev, skb, vni); } #endif } if (nhid) return vxlan_xmit_nhid(skb, dev, nhid, vni); if (vxlan->cfg.flags & VXLAN_F_MDB) { struct vxlan_mdb_entry *mdb_entry; rcu_read_lock(); mdb_entry = vxlan_mdb_entry_skb_get(vxlan, skb, vni); if (mdb_entry) { netdev_tx_t ret; ret = vxlan_mdb_xmit(vxlan, mdb_entry, skb); rcu_read_unlock(); return ret; } rcu_read_unlock(); } eth = eth_hdr(skb); f = vxlan_find_mac(vxlan, eth->h_dest, vni); did_rsc = false; if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) && (ntohs(eth->h_proto) == ETH_P_IP || ntohs(eth->h_proto) == ETH_P_IPV6)) { did_rsc = route_shortcircuit(dev, skb); if (did_rsc) f = vxlan_find_mac(vxlan, eth->h_dest, vni); } if (f == NULL) { f = vxlan_find_mac(vxlan, all_zeros_mac, vni); if (f == NULL) { if ((vxlan->cfg.flags & VXLAN_F_L2MISS) && !is_multicast_ether_addr(eth->h_dest)) vxlan_fdb_miss(vxlan, eth->h_dest); dev_core_stats_tx_dropped_inc(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); kfree_skb(skb); return NETDEV_TX_OK; } } if (rcu_access_pointer(f->nh)) { vxlan_xmit_nh(skb, dev, f, (vni ? : vxlan->default_dst.remote_vni), did_rsc); } else { list_for_each_entry_rcu(rdst, &f->remotes, list) { struct sk_buff *skb1; if (!fdst) { fdst = rdst; continue; } skb1 = skb_clone(skb, GFP_ATOMIC); if (skb1) vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc); } if (fdst) vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); else kfree_skb(skb); } return NETDEV_TX_OK; } /* Walk the forwarding table and purge stale entries */ static void vxlan_cleanup(struct timer_list *t) { struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer); unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; unsigned int h; if (!netif_running(vxlan->dev)) return; for (h = 0; h < FDB_HASH_SIZE; ++h) { struct hlist_node *p, *n; spin_lock(&vxlan->hash_lock[h]); hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { struct vxlan_fdb *f = container_of(p, struct vxlan_fdb, hlist); unsigned long timeout; if (f->state & (NUD_PERMANENT | NUD_NOARP)) continue; if (f->flags & NTF_EXT_LEARNED) continue; timeout = f->used + vxlan->cfg.age_interval * HZ; if (time_before_eq(timeout, jiffies)) { netdev_dbg(vxlan->dev, "garbage collect %pM\n", f->eth_addr); f->state = NUD_STALE; vxlan_fdb_destroy(vxlan, f, true, true); } else if (time_before(timeout, next_timer)) next_timer = timeout; } spin_unlock(&vxlan->hash_lock[h]); } mod_timer(&vxlan->age_timer, next_timer); } static void vxlan_vs_del_dev(struct vxlan_dev *vxlan) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); spin_lock(&vn->sock_lock); hlist_del_init_rcu(&vxlan->hlist4.hlist); #if IS_ENABLED(CONFIG_IPV6) hlist_del_init_rcu(&vxlan->hlist6.hlist); #endif spin_unlock(&vn->sock_lock); } static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan, struct vxlan_dev_node *node) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); __be32 vni = vxlan->default_dst.remote_vni; node->vxlan = vxlan; spin_lock(&vn->sock_lock); hlist_add_head_rcu(&node->hlist, vni_head(vs, vni)); spin_unlock(&vn->sock_lock); } /* Setup stats when device is created */ static int vxlan_init(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); int err; if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vnigroup_init(vxlan); err = gro_cells_init(&vxlan->gro_cells, dev); if (err) goto err_vnigroup_uninit; err = vxlan_mdb_init(vxlan); if (err) goto err_gro_cells_destroy; netdev_lockdep_set_classes(dev); return 0; err_gro_cells_destroy: gro_cells_destroy(&vxlan->gro_cells); err_vnigroup_uninit: if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vnigroup_uninit(vxlan); return err; } static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni) { struct vxlan_fdb *f; u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); f = __vxlan_find_mac(vxlan, all_zeros_mac, vni); if (f) vxlan_fdb_destroy(vxlan, f, true, true); spin_unlock_bh(&vxlan->hash_lock[hash_index]); } static void vxlan_uninit(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); vxlan_mdb_fini(vxlan); if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vnigroup_uninit(vxlan); gro_cells_destroy(&vxlan->gro_cells); vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni); } /* Start ageing timer and join group when device is brought up */ static int vxlan_open(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); int ret; ret = vxlan_sock_add(vxlan); if (ret < 0) return ret; ret = vxlan_multicast_join(vxlan); if (ret) { vxlan_sock_release(vxlan); return ret; } if (vxlan->cfg.age_interval) mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL); return ret; } struct vxlan_fdb_flush_desc { bool ignore_default_entry; unsigned long state; unsigned long state_mask; unsigned long flags; unsigned long flags_mask; __be32 src_vni; u32 nhid; __be32 vni; __be16 port; union vxlan_addr dst_ip; }; static bool vxlan_fdb_is_default_entry(const struct vxlan_fdb *f, const struct vxlan_dev *vxlan) { return is_zero_ether_addr(f->eth_addr) && f->vni == vxlan->cfg.vni; } static bool vxlan_fdb_nhid_matches(const struct vxlan_fdb *f, u32 nhid) { struct nexthop *nh = rtnl_dereference(f->nh); return nh && nh->id == nhid; } static bool vxlan_fdb_flush_matches(const struct vxlan_fdb *f, const struct vxlan_dev *vxlan, const struct vxlan_fdb_flush_desc *desc) { if (desc->state_mask && (f->state & desc->state_mask) != desc->state) return false; if (desc->flags_mask && (f->flags & desc->flags_mask) != desc->flags) return false; if (desc->ignore_default_entry && vxlan_fdb_is_default_entry(f, vxlan)) return false; if (desc->src_vni && f->vni != desc->src_vni) return false; if (desc->nhid && !vxlan_fdb_nhid_matches(f, desc->nhid)) return false; return true; } static bool vxlan_fdb_flush_should_match_remotes(const struct vxlan_fdb_flush_desc *desc) { return desc->vni || desc->port || desc->dst_ip.sa.sa_family; } static bool vxlan_fdb_flush_remote_matches(const struct vxlan_fdb_flush_desc *desc, const struct vxlan_rdst *rd) { if (desc->vni && rd->remote_vni != desc->vni) return false; if (desc->port && rd->remote_port != desc->port) return false; if (desc->dst_ip.sa.sa_family && !vxlan_addr_equal(&rd->remote_ip, &desc->dst_ip)) return false; return true; } static void vxlan_fdb_flush_match_remotes(struct vxlan_fdb *f, struct vxlan_dev *vxlan, const struct vxlan_fdb_flush_desc *desc, bool *p_destroy_fdb) { bool remotes_flushed = false; struct vxlan_rdst *rd, *tmp; list_for_each_entry_safe(rd, tmp, &f->remotes, list) { if (!vxlan_fdb_flush_remote_matches(desc, rd)) continue; vxlan_fdb_dst_destroy(vxlan, f, rd, true); remotes_flushed = true; } *p_destroy_fdb = remotes_flushed && list_empty(&f->remotes); } /* Purge the forwarding table */ static void vxlan_flush(struct vxlan_dev *vxlan, const struct vxlan_fdb_flush_desc *desc) { bool match_remotes = vxlan_fdb_flush_should_match_remotes(desc); unsigned int h; for (h = 0; h < FDB_HASH_SIZE; ++h) { struct hlist_node *p, *n; spin_lock_bh(&vxlan->hash_lock[h]); hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { struct vxlan_fdb *f = container_of(p, struct vxlan_fdb, hlist); if (!vxlan_fdb_flush_matches(f, vxlan, desc)) continue; if (match_remotes) { bool destroy_fdb = false; vxlan_fdb_flush_match_remotes(f, vxlan, desc, &destroy_fdb); if (!destroy_fdb) continue; } vxlan_fdb_destroy(vxlan, f, true, true); } spin_unlock_bh(&vxlan->hash_lock[h]); } } static const struct nla_policy vxlan_del_bulk_policy[NDA_MAX + 1] = { [NDA_SRC_VNI] = { .type = NLA_U32 }, [NDA_NH_ID] = { .type = NLA_U32 }, [NDA_VNI] = { .type = NLA_U32 }, [NDA_PORT] = { .type = NLA_U16 }, [NDA_DST] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), [NDA_NDM_STATE_MASK] = { .type = NLA_U16 }, [NDA_NDM_FLAGS_MASK] = { .type = NLA_U8 }, }; #define VXLAN_FDB_FLUSH_IGNORED_NDM_FLAGS (NTF_MASTER | NTF_SELF) #define VXLAN_FDB_FLUSH_ALLOWED_NDM_STATES (NUD_PERMANENT | NUD_NOARP) #define VXLAN_FDB_FLUSH_ALLOWED_NDM_FLAGS (NTF_EXT_LEARNED | NTF_OFFLOADED | \ NTF_ROUTER) static int vxlan_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb_flush_desc desc = {}; struct ndmsg *ndm = nlmsg_data(nlh); struct nlattr *tb[NDA_MAX + 1]; u8 ndm_flags; int err; ndm_flags = ndm->ndm_flags & ~VXLAN_FDB_FLUSH_IGNORED_NDM_FLAGS; err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, vxlan_del_bulk_policy, extack); if (err) return err; if (ndm_flags & ~VXLAN_FDB_FLUSH_ALLOWED_NDM_FLAGS) { NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm flag bits set"); return -EINVAL; } if (ndm->ndm_state & ~VXLAN_FDB_FLUSH_ALLOWED_NDM_STATES) { NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm state bits set"); return -EINVAL; } desc.state = ndm->ndm_state; desc.flags = ndm_flags; if (tb[NDA_NDM_STATE_MASK]) desc.state_mask = nla_get_u16(tb[NDA_NDM_STATE_MASK]); if (tb[NDA_NDM_FLAGS_MASK]) desc.flags_mask = nla_get_u8(tb[NDA_NDM_FLAGS_MASK]); if (tb[NDA_SRC_VNI]) desc.src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI])); if (tb[NDA_NH_ID]) desc.nhid = nla_get_u32(tb[NDA_NH_ID]); if (tb[NDA_VNI]) desc.vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI])); if (tb[NDA_PORT]) desc.port = nla_get_be16(tb[NDA_PORT]); if (tb[NDA_DST]) { union vxlan_addr ip; err = vxlan_nla_get_addr(&ip, tb[NDA_DST]); if (err) { NL_SET_ERR_MSG_ATTR(extack, tb[NDA_DST], "Unsupported address family"); return err; } desc.dst_ip = ip; } vxlan_flush(vxlan, &desc); return 0; } /* Cleanup timer and forwarding table on shutdown */ static int vxlan_stop(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb_flush_desc desc = { /* Default entry is deleted at vxlan_uninit. */ .ignore_default_entry = true, .state = 0, .state_mask = NUD_PERMANENT | NUD_NOARP, }; vxlan_multicast_leave(vxlan); del_timer_sync(&vxlan->age_timer); vxlan_flush(vxlan, &desc); vxlan_sock_release(vxlan); return 0; } /* Stub, nothing needs to be done. */ static void vxlan_set_multicast_list(struct net_device *dev) { } static int vxlan_change_mtu(struct net_device *dev, int new_mtu) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; struct net_device *lowerdev = __dev_get_by_index(vxlan->net, dst->remote_ifindex); /* This check is different than dev->max_mtu, because it looks at * the lowerdev->mtu, rather than the static dev->max_mtu */ if (lowerdev) { int max_mtu = lowerdev->mtu - vxlan_headroom(vxlan->cfg.flags); if (new_mtu > max_mtu) return -EINVAL; } WRITE_ONCE(dev->mtu, new_mtu); return 0; } static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct vxlan_dev *vxlan = netdev_priv(dev); struct ip_tunnel_info *info = skb_tunnel_info(skb); __be16 sport, dport; sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, vxlan->cfg.port_max, true); dport = info->key.tp_dst ? : vxlan->cfg.dst_port; if (ip_tunnel_info_af(info) == AF_INET) { struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); struct rtable *rt; if (!sock4) return -EIO; rt = udp_tunnel_dst_lookup(skb, dev, vxlan->net, 0, &info->key.u.ipv4.src, &info->key, sport, dport, info->key.tos, &info->dst_cache); if (IS_ERR(rt)) return PTR_ERR(rt); ip_rt_put(rt); } else { #if IS_ENABLED(CONFIG_IPV6) struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); struct dst_entry *ndst; if (!sock6) return -EIO; ndst = udp_tunnel6_dst_lookup(skb, dev, vxlan->net, sock6->sock, 0, &info->key.u.ipv6.src, &info->key, sport, dport, info->key.tos, &info->dst_cache); if (IS_ERR(ndst)) return PTR_ERR(ndst); dst_release(ndst); #else /* !CONFIG_IPV6 */ return -EPFNOSUPPORT; #endif } info->key.tp_src = sport; info->key.tp_dst = dport; return 0; } static const struct net_device_ops vxlan_netdev_ether_ops = { .ndo_init = vxlan_init, .ndo_uninit = vxlan_uninit, .ndo_open = vxlan_open, .ndo_stop = vxlan_stop, .ndo_start_xmit = vxlan_xmit, .ndo_set_rx_mode = vxlan_set_multicast_list, .ndo_change_mtu = vxlan_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_fdb_add = vxlan_fdb_add, .ndo_fdb_del = vxlan_fdb_delete, .ndo_fdb_del_bulk = vxlan_fdb_delete_bulk, .ndo_fdb_dump = vxlan_fdb_dump, .ndo_fdb_get = vxlan_fdb_get, .ndo_mdb_add = vxlan_mdb_add, .ndo_mdb_del = vxlan_mdb_del, .ndo_mdb_del_bulk = vxlan_mdb_del_bulk, .ndo_mdb_dump = vxlan_mdb_dump, .ndo_mdb_get = vxlan_mdb_get, .ndo_fill_metadata_dst = vxlan_fill_metadata_dst, }; static const struct net_device_ops vxlan_netdev_raw_ops = { .ndo_init = vxlan_init, .ndo_uninit = vxlan_uninit, .ndo_open = vxlan_open, .ndo_stop = vxlan_stop, .ndo_start_xmit = vxlan_xmit, .ndo_change_mtu = vxlan_change_mtu, .ndo_fill_metadata_dst = vxlan_fill_metadata_dst, }; /* Info for udev, that this is a virtual tunnel endpoint */ static const struct device_type vxlan_type = { .name = "vxlan", }; /* Calls the ndo_udp_tunnel_add of the caller in order to * supply the listening VXLAN udp ports. Callers are expected * to implement the ndo_udp_tunnel_add. */ static void vxlan_offload_rx_ports(struct net_device *dev, bool push) { struct vxlan_sock *vs; struct net *net = dev_net(dev); struct vxlan_net *vn = net_generic(net, vxlan_net_id); unsigned int i; spin_lock(&vn->sock_lock); for (i = 0; i < PORT_HASH_SIZE; ++i) { hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) { unsigned short type; if (vs->flags & VXLAN_F_GPE) type = UDP_TUNNEL_TYPE_VXLAN_GPE; else type = UDP_TUNNEL_TYPE_VXLAN; if (push) udp_tunnel_push_rx_port(dev, vs->sock, type); else udp_tunnel_drop_rx_port(dev, vs->sock, type); } } spin_unlock(&vn->sock_lock); } /* Initialize the device structure. */ static void vxlan_setup(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); unsigned int h; eth_hw_addr_random(dev); ether_setup(dev); dev->needs_free_netdev = true; SET_NETDEV_DEVTYPE(dev, &vxlan_type); dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->features |= NETIF_F_RXCSUM; dev->features |= NETIF_F_GSO_SOFTWARE; dev->vlan_features = dev->features; dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->hw_features |= NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; netif_keep_dst(dev); dev->priv_flags |= IFF_NO_QUEUE; dev->change_proto_down = true; dev->lltx = true; /* MTU range: 68 - 65535 */ dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = ETH_MAX_MTU; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; INIT_LIST_HEAD(&vxlan->next); timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE); vxlan->dev = dev; for (h = 0; h < FDB_HASH_SIZE; ++h) { spin_lock_init(&vxlan->hash_lock[h]); INIT_HLIST_HEAD(&vxlan->fdb_head[h]); } } static void vxlan_ether_setup(struct net_device *dev) { dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; dev->netdev_ops = &vxlan_netdev_ether_ops; } static void vxlan_raw_setup(struct net_device *dev) { dev->header_ops = NULL; dev->type = ARPHRD_NONE; dev->hard_header_len = 0; dev->addr_len = 0; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->netdev_ops = &vxlan_netdev_raw_ops; } static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_UNSPEC] = { .strict_start_type = IFLA_VXLAN_LOCALBYPASS }, [IFLA_VXLAN_ID] = { .type = NLA_U32 }, [IFLA_VXLAN_GROUP] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_VXLAN_GROUP6] = { .len = sizeof(struct in6_addr) }, [IFLA_VXLAN_LINK] = { .type = NLA_U32 }, [IFLA_VXLAN_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) }, [IFLA_VXLAN_LOCAL6] = { .len = sizeof(struct in6_addr) }, [IFLA_VXLAN_TOS] = { .type = NLA_U8 }, [IFLA_VXLAN_TTL] = { .type = NLA_U8 }, [IFLA_VXLAN_LABEL] = { .type = NLA_U32 }, [IFLA_VXLAN_LEARNING] = { .type = NLA_U8 }, [IFLA_VXLAN_AGEING] = { .type = NLA_U32 }, [IFLA_VXLAN_LIMIT] = { .type = NLA_U32 }, [IFLA_VXLAN_PORT_RANGE] = { .len = sizeof(struct ifla_vxlan_port_range) }, [IFLA_VXLAN_PROXY] = { .type = NLA_U8 }, [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, [IFLA_VXLAN_COLLECT_METADATA] = { .type = NLA_U8 }, [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, [IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 }, [IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 }, [IFLA_VXLAN_GBP] = { .type = NLA_FLAG, }, [IFLA_VXLAN_GPE] = { .type = NLA_FLAG, }, [IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG }, [IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG }, [IFLA_VXLAN_DF] = { .type = NLA_U8 }, [IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 }, [IFLA_VXLAN_LOCALBYPASS] = NLA_POLICY_MAX(NLA_U8, 1), [IFLA_VXLAN_LABEL_POLICY] = NLA_POLICY_MAX(NLA_U32, VXLAN_LABEL_MAX), }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], "Provided link layer address is not Ethernet"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], "Provided Ethernet address is not unicast"); return -EADDRNOTAVAIL; } } if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU], "MTU must be between 68 and 65535"); return -EINVAL; } } if (!data) { NL_SET_ERR_MSG(extack, "Required attributes not provided to perform the operation"); return -EINVAL; } if (data[IFLA_VXLAN_ID]) { u32 id = nla_get_u32(data[IFLA_VXLAN_ID]); if (id >= VXLAN_N_VID) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_ID], "VXLAN ID must be lower than 16777216"); return -ERANGE; } } if (data[IFLA_VXLAN_PORT_RANGE]) { const struct ifla_vxlan_port_range *p = nla_data(data[IFLA_VXLAN_PORT_RANGE]); if (ntohs(p->high) < ntohs(p->low)) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_PORT_RANGE], "Invalid source port range"); return -EINVAL; } } if (data[IFLA_VXLAN_DF]) { enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]); if (df < 0 || df > VXLAN_DF_MAX) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_DF], "Invalid DF attribute"); return -EINVAL; } } return 0; } static void vxlan_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version)); strscpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver)); } static int vxlan_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; struct net_device *lowerdev = __dev_get_by_index(vxlan->net, dst->remote_ifindex); if (!lowerdev) { cmd->base.duplex = DUPLEX_UNKNOWN; cmd->base.port = PORT_OTHER; cmd->base.speed = SPEED_UNKNOWN; return 0; } return __ethtool_get_link_ksettings(lowerdev, cmd); } static const struct ethtool_ops vxlan_ethtool_ops = { .get_drvinfo = vxlan_get_drvinfo, .get_link = ethtool_op_get_link, .get_link_ksettings = vxlan_get_link_ksettings, }; static struct socket *vxlan_create_sock(struct net *net, bool ipv6, __be16 port, u32 flags, int ifindex) { struct socket *sock; struct udp_port_cfg udp_conf; int err; memset(&udp_conf, 0, sizeof(udp_conf)); if (ipv6) { udp_conf.family = AF_INET6; udp_conf.use_udp6_rx_checksums = !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX); udp_conf.ipv6_v6only = 1; } else { udp_conf.family = AF_INET; } udp_conf.local_udp_port = port; udp_conf.bind_ifindex = ifindex; /* Open UDP socket */ err = udp_sock_create(net, &udp_conf, &sock); if (err < 0) return ERR_PTR(err); udp_allow_gso(sock->sk); return sock; } /* Create new listen socket if needed */ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6, __be16 port, u32 flags, int ifindex) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_sock *vs; struct socket *sock; unsigned int h; struct udp_tunnel_sock_cfg tunnel_cfg; vs = kzalloc(sizeof(*vs), GFP_KERNEL); if (!vs) return ERR_PTR(-ENOMEM); for (h = 0; h < VNI_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vs->vni_list[h]); sock = vxlan_create_sock(net, ipv6, port, flags, ifindex); if (IS_ERR(sock)) { kfree(vs); return ERR_CAST(sock); } vs->sock = sock; refcount_set(&vs->refcnt, 1); vs->flags = (flags & VXLAN_F_RCV_FLAGS); spin_lock(&vn->sock_lock); hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); udp_tunnel_notify_add_rx_port(sock, (vs->flags & VXLAN_F_GPE) ? UDP_TUNNEL_TYPE_VXLAN_GPE : UDP_TUNNEL_TYPE_VXLAN); spin_unlock(&vn->sock_lock); /* Mark socket as an encapsulation socket. */ memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); tunnel_cfg.sk_user_data = vs; tunnel_cfg.encap_type = 1; tunnel_cfg.encap_rcv = vxlan_rcv; tunnel_cfg.encap_err_lookup = vxlan_err_lookup; tunnel_cfg.encap_destroy = NULL; if (vs->flags & VXLAN_F_GPE) { tunnel_cfg.gro_receive = vxlan_gpe_gro_receive; tunnel_cfg.gro_complete = vxlan_gpe_gro_complete; } else { tunnel_cfg.gro_receive = vxlan_gro_receive; tunnel_cfg.gro_complete = vxlan_gro_complete; } setup_udp_tunnel_sock(net, sock, &tunnel_cfg); return vs; } static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA; struct vxlan_sock *vs = NULL; struct vxlan_dev_node *node; int l3mdev_index = 0; if (vxlan->cfg.remote_ifindex) l3mdev_index = l3mdev_master_upper_ifindex_by_index( vxlan->net, vxlan->cfg.remote_ifindex); if (!vxlan->cfg.no_share) { spin_lock(&vn->sock_lock); vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET, vxlan->cfg.dst_port, vxlan->cfg.flags, l3mdev_index); if (vs && !refcount_inc_not_zero(&vs->refcnt)) { spin_unlock(&vn->sock_lock); return -EBUSY; } spin_unlock(&vn->sock_lock); } if (!vs) vs = vxlan_socket_create(vxlan->net, ipv6, vxlan->cfg.dst_port, vxlan->cfg.flags, l3mdev_index); if (IS_ERR(vs)) return PTR_ERR(vs); #if IS_ENABLED(CONFIG_IPV6) if (ipv6) { rcu_assign_pointer(vxlan->vn6_sock, vs); node = &vxlan->hlist6; } else #endif { rcu_assign_pointer(vxlan->vn4_sock, vs); node = &vxlan->hlist4; } if (metadata && (vxlan->cfg.flags & VXLAN_F_VNIFILTER)) vxlan_vs_add_vnigrp(vxlan, vs, ipv6); else vxlan_vs_add_dev(vs, vxlan, node); return 0; } static int vxlan_sock_add(struct vxlan_dev *vxlan) { bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA; bool ipv6 = vxlan->cfg.flags & VXLAN_F_IPV6 || metadata; bool ipv4 = !ipv6 || metadata; int ret = 0; RCU_INIT_POINTER(vxlan->vn4_sock, NULL); #if IS_ENABLED(CONFIG_IPV6) RCU_INIT_POINTER(vxlan->vn6_sock, NULL); if (ipv6) { ret = __vxlan_sock_add(vxlan, true); if (ret < 0 && ret != -EAFNOSUPPORT) ipv4 = false; } #endif if (ipv4) ret = __vxlan_sock_add(vxlan, false); if (ret < 0) vxlan_sock_release(vxlan); return ret; } int vxlan_vni_in_use(struct net *src_net, struct vxlan_dev *vxlan, struct vxlan_config *conf, __be32 vni) { struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); struct vxlan_dev *tmp; list_for_each_entry(tmp, &vn->vxlan_list, next) { if (tmp == vxlan) continue; if (tmp->cfg.flags & VXLAN_F_VNIFILTER) { if (!vxlan_vnifilter_lookup(tmp, vni)) continue; } else if (tmp->cfg.vni != vni) { continue; } if (tmp->cfg.dst_port != conf->dst_port) continue; if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) != (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6))) continue; if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) && tmp->cfg.remote_ifindex != conf->remote_ifindex) continue; return -EEXIST; } return 0; } static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf, struct net_device **lower, struct vxlan_dev *old, struct netlink_ext_ack *extack) { bool use_ipv6 = false; if (conf->flags & VXLAN_F_GPE) { /* For now, allow GPE only together with * COLLECT_METADATA. This can be relaxed later; in such * case, the other side of the PtP link will have to be * provided. */ if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) || !(conf->flags & VXLAN_F_COLLECT_METADATA)) { NL_SET_ERR_MSG(extack, "VXLAN GPE does not support this combination of attributes"); return -EINVAL; } } if (!conf->remote_ip.sa.sa_family && !conf->saddr.sa.sa_family) { /* Unless IPv6 is explicitly requested, assume IPv4 */ conf->remote_ip.sa.sa_family = AF_INET; conf->saddr.sa.sa_family = AF_INET; } else if (!conf->remote_ip.sa.sa_family) { conf->remote_ip.sa.sa_family = conf->saddr.sa.sa_family; } else if (!conf->saddr.sa.sa_family) { conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family; } if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) { NL_SET_ERR_MSG(extack, "Local and remote address must be from the same family"); return -EINVAL; } if (vxlan_addr_multicast(&conf->saddr)) { NL_SET_ERR_MSG(extack, "Local address cannot be multicast"); return -EINVAL; } if (conf->saddr.sa.sa_family == AF_INET6) { if (!IS_ENABLED(CONFIG_IPV6)) { NL_SET_ERR_MSG(extack, "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; } use_ipv6 = true; conf->flags |= VXLAN_F_IPV6; if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) { int local_type = ipv6_addr_type(&conf->saddr.sin6.sin6_addr); int remote_type = ipv6_addr_type(&conf->remote_ip.sin6.sin6_addr); if (local_type & IPV6_ADDR_LINKLOCAL) { if (!(remote_type & IPV6_ADDR_LINKLOCAL) && (remote_type != IPV6_ADDR_ANY)) { NL_SET_ERR_MSG(extack, "Invalid combination of local and remote address scopes"); return -EINVAL; } conf->flags |= VXLAN_F_IPV6_LINKLOCAL; } else { if (remote_type == (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL)) { NL_SET_ERR_MSG(extack, "Invalid combination of local and remote address scopes"); return -EINVAL; } conf->flags &= ~VXLAN_F_IPV6_LINKLOCAL; } } } if (conf->label && !use_ipv6) { NL_SET_ERR_MSG(extack, "Label attribute only applies to IPv6 VXLAN devices"); return -EINVAL; } if (conf->label_policy && !use_ipv6) { NL_SET_ERR_MSG(extack, "Label policy only applies to IPv6 VXLAN devices"); return -EINVAL; } if (conf->remote_ifindex) { struct net_device *lowerdev; lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex); if (!lowerdev) { NL_SET_ERR_MSG(extack, "Invalid local interface, device not found"); return -ENODEV; } #if IS_ENABLED(CONFIG_IPV6) if (use_ipv6) { struct inet6_dev *idev = __in6_dev_get(lowerdev); if (idev && idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 support disabled by administrator"); return -EPERM; } } #endif *lower = lowerdev; } else { if (vxlan_addr_multicast(&conf->remote_ip)) { NL_SET_ERR_MSG(extack, "Local interface required for multicast remote destination"); return -EINVAL; } #if IS_ENABLED(CONFIG_IPV6) if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) { NL_SET_ERR_MSG(extack, "Local interface required for link-local local/remote addresses"); return -EINVAL; } #endif *lower = NULL; } if (!conf->dst_port) { if (conf->flags & VXLAN_F_GPE) conf->dst_port = htons(IANA_VXLAN_GPE_UDP_PORT); else conf->dst_port = htons(vxlan_port); } if (!conf->age_interval) conf->age_interval = FDB_AGE_DEFAULT; if (vxlan_vni_in_use(src_net, old, conf, conf->vni)) { NL_SET_ERR_MSG(extack, "A VXLAN device with the specified VNI already exists"); return -EEXIST; } return 0; } static void vxlan_config_apply(struct net_device *dev, struct vxlan_config *conf, struct net_device *lowerdev, struct net *src_net, bool changelink) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; unsigned short needed_headroom = ETH_HLEN; int max_mtu = ETH_MAX_MTU; u32 flags = conf->flags; if (!changelink) { if (flags & VXLAN_F_GPE) vxlan_raw_setup(dev); else vxlan_ether_setup(dev); if (conf->mtu) dev->mtu = conf->mtu; vxlan->net = src_net; } dst->remote_vni = conf->vni; memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip)); if (lowerdev) { dst->remote_ifindex = conf->remote_ifindex; netif_inherit_tso_max(dev, lowerdev); needed_headroom = lowerdev->hard_header_len; needed_headroom += lowerdev->needed_headroom; dev->needed_tailroom = lowerdev->needed_tailroom; max_mtu = lowerdev->mtu - vxlan_headroom(flags); if (max_mtu < ETH_MIN_MTU) max_mtu = ETH_MIN_MTU; if (!changelink && !conf->mtu) dev->mtu = max_mtu; } if (dev->mtu > max_mtu) dev->mtu = max_mtu; if (flags & VXLAN_F_COLLECT_METADATA) flags |= VXLAN_F_IPV6; needed_headroom += vxlan_headroom(flags); dev->needed_headroom = needed_headroom; memcpy(&vxlan->cfg, conf, sizeof(*conf)); } static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, struct vxlan_config *conf, bool changelink, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); struct net_device *lowerdev; int ret; ret = vxlan_config_validate(src_net, conf, &lowerdev, vxlan, extack); if (ret) return ret; vxlan_config_apply(dev, conf, lowerdev, src_net, changelink); return 0; } static int __vxlan_dev_create(struct net *net, struct net_device *dev, struct vxlan_config *conf, struct netlink_ext_ack *extack) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); struct net_device *remote_dev = NULL; struct vxlan_fdb *f = NULL; bool unregister = false; struct vxlan_rdst *dst; int err; dst = &vxlan->default_dst; err = vxlan_dev_configure(net, dev, conf, false, extack); if (err) return err; dev->ethtool_ops = &vxlan_ethtool_ops; /* create an fdb entry for a valid default destination */ if (!vxlan_addr_any(&dst->remote_ip)) { err = vxlan_fdb_create(vxlan, all_zeros_mac, &dst->remote_ip, NUD_REACHABLE | NUD_PERMANENT, vxlan->cfg.dst_port, dst->remote_vni, dst->remote_vni, dst->remote_ifindex, NTF_SELF, 0, &f, extack); if (err) return err; } err = register_netdevice(dev); if (err) goto errout; unregister = true; if (dst->remote_ifindex) { remote_dev = __dev_get_by_index(net, dst->remote_ifindex); if (!remote_dev) { err = -ENODEV; goto errout; } err = netdev_upper_dev_link(remote_dev, dev, extack); if (err) goto errout; } err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto unlink; if (f) { vxlan_fdb_insert(vxlan, all_zeros_mac, dst->remote_vni, f); /* notify default fdb entry */ err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH, true, extack); if (err) { vxlan_fdb_destroy(vxlan, f, false, false); if (remote_dev) netdev_upper_dev_unlink(remote_dev, dev); goto unregister; } } list_add(&vxlan->next, &vn->vxlan_list); if (remote_dev) dst->remote_dev = remote_dev; return 0; unlink: if (remote_dev) netdev_upper_dev_unlink(remote_dev, dev); errout: /* unregister_netdevice() destroys the default FDB entry with deletion * notification. But the addition notification was not sent yet, so * destroy the entry by hand here. */ if (f) __vxlan_fdb_free(f); unregister: if (unregister) unregister_netdevice(dev); return err; } /* Set/clear flags based on attribute */ static int vxlan_nl2flag(struct vxlan_config *conf, struct nlattr *tb[], int attrtype, unsigned long mask, bool changelink, bool changelink_supported, struct netlink_ext_ack *extack) { unsigned long flags; if (!tb[attrtype]) return 0; if (changelink && !changelink_supported) { vxlan_flag_attr_error(attrtype, extack); return -EOPNOTSUPP; } if (vxlan_policy[attrtype].type == NLA_FLAG) flags = conf->flags | mask; else if (nla_get_u8(tb[attrtype])) flags = conf->flags | mask; else flags = conf->flags & ~mask; conf->flags = flags; return 0; } static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], struct net_device *dev, struct vxlan_config *conf, bool changelink, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); int err = 0; memset(conf, 0, sizeof(*conf)); /* if changelink operation, start with old existing cfg */ if (changelink) memcpy(conf, &vxlan->cfg, sizeof(*conf)); if (data[IFLA_VXLAN_ID]) { __be32 vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID])); if (changelink && (vni != conf->vni)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID], "Cannot change VNI"); return -EOPNOTSUPP; } conf->vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID])); } if (data[IFLA_VXLAN_GROUP]) { if (changelink && (conf->remote_ip.sa.sa_family != AF_INET)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP], "New group address family does not match old group"); return -EOPNOTSUPP; } conf->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]); conf->remote_ip.sa.sa_family = AF_INET; } else if (data[IFLA_VXLAN_GROUP6]) { if (!IS_ENABLED(CONFIG_IPV6)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; } if (changelink && (conf->remote_ip.sa.sa_family != AF_INET6)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "New group address family does not match old group"); return -EOPNOTSUPP; } conf->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]); conf->remote_ip.sa.sa_family = AF_INET6; } if (data[IFLA_VXLAN_LOCAL]) { if (changelink && (conf->saddr.sa.sa_family != AF_INET)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL], "New local address family does not match old"); return -EOPNOTSUPP; } conf->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]); conf->saddr.sa.sa_family = AF_INET; } else if (data[IFLA_VXLAN_LOCAL6]) { if (!IS_ENABLED(CONFIG_IPV6)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; } if (changelink && (conf->saddr.sa.sa_family != AF_INET6)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "New local address family does not match old"); return -EOPNOTSUPP; } /* TODO: respect scope id */ conf->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]); conf->saddr.sa.sa_family = AF_INET6; } if (data[IFLA_VXLAN_LINK]) conf->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]); if (data[IFLA_VXLAN_TOS]) conf->tos = nla_get_u8(data[IFLA_VXLAN_TOS]); if (data[IFLA_VXLAN_TTL]) conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]); if (data[IFLA_VXLAN_TTL_INHERIT]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_TTL_INHERIT, VXLAN_F_TTL_INHERIT, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_LABEL]) conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) & IPV6_FLOWLABEL_MASK; if (data[IFLA_VXLAN_LABEL_POLICY]) conf->label_policy = nla_get_u32(data[IFLA_VXLAN_LABEL_POLICY]); if (data[IFLA_VXLAN_LEARNING]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING, VXLAN_F_LEARN, changelink, true, extack); if (err) return err; } else if (!changelink) { /* default to learn on a new device */ conf->flags |= VXLAN_F_LEARN; } if (data[IFLA_VXLAN_AGEING]) conf->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]); if (data[IFLA_VXLAN_PROXY]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_PROXY, VXLAN_F_PROXY, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_RSC]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_RSC, VXLAN_F_RSC, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_L2MISS]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L2MISS, VXLAN_F_L2MISS, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_L3MISS]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L3MISS, VXLAN_F_L3MISS, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_LIMIT]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LIMIT], "Cannot change limit"); return -EOPNOTSUPP; } conf->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); } if (data[IFLA_VXLAN_COLLECT_METADATA]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_COLLECT_METADATA, VXLAN_F_COLLECT_METADATA, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_PORT_RANGE]) { if (!changelink) { const struct ifla_vxlan_port_range *p = nla_data(data[IFLA_VXLAN_PORT_RANGE]); conf->port_min = ntohs(p->low); conf->port_max = ntohs(p->high); } else { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE], "Cannot change port range"); return -EOPNOTSUPP; } } if (data[IFLA_VXLAN_PORT]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT], "Cannot change port"); return -EOPNOTSUPP; } conf->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]); } if (data[IFLA_VXLAN_UDP_CSUM]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_UDP_CSUM], "Cannot change UDP_CSUM flag"); return -EOPNOTSUPP; } if (!nla_get_u8(data[IFLA_VXLAN_UDP_CSUM])) conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX; } if (data[IFLA_VXLAN_LOCALBYPASS]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LOCALBYPASS, VXLAN_F_LOCALBYPASS, changelink, true, extack); if (err) return err; } else if (!changelink) { /* default to local bypass on a new device */ conf->flags |= VXLAN_F_LOCALBYPASS; } if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, VXLAN_F_UDP_ZERO_CSUM6_TX, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, VXLAN_F_UDP_ZERO_CSUM6_RX, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_REMCSUM_TX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_TX, VXLAN_F_REMCSUM_TX, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_REMCSUM_RX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_RX, VXLAN_F_REMCSUM_RX, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_GBP]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GBP, VXLAN_F_GBP, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_GPE]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GPE, VXLAN_F_GPE, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_NOPARTIAL, VXLAN_F_REMCSUM_NOPARTIAL, changelink, false, extack); if (err) return err; } if (tb[IFLA_MTU]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU], "Cannot change mtu"); return -EOPNOTSUPP; } conf->mtu = nla_get_u32(tb[IFLA_MTU]); } if (data[IFLA_VXLAN_DF]) conf->df = nla_get_u8(data[IFLA_VXLAN_DF]); if (data[IFLA_VXLAN_VNIFILTER]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_VNIFILTER, VXLAN_F_VNIFILTER, changelink, false, extack); if (err) return err; if ((conf->flags & VXLAN_F_VNIFILTER) && !(conf->flags & VXLAN_F_COLLECT_METADATA)) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_VNIFILTER], "vxlan vnifilter only valid in collect metadata mode"); return -EINVAL; } } return 0; } static int vxlan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct vxlan_config conf; int err; err = vxlan_nl2conf(tb, data, dev, &conf, false, extack); if (err) return err; return __vxlan_dev_create(src_net, dev, &conf, extack); } static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); struct net_device *lowerdev; struct vxlan_config conf; struct vxlan_rdst *dst; int err; dst = &vxlan->default_dst; err = vxlan_nl2conf(tb, data, dev, &conf, true, extack); if (err) return err; err = vxlan_config_validate(vxlan->net, &conf, &lowerdev, vxlan, extack); if (err) return err; if (dst->remote_dev == lowerdev) lowerdev = NULL; err = netdev_adjacent_change_prepare(dst->remote_dev, lowerdev, dev, extack); if (err) return err; /* handle default dst entry */ if (!vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip)) { u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, conf.vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); if (!vxlan_addr_any(&conf.remote_ip)) { err = vxlan_fdb_update(vxlan, all_zeros_mac, &conf.remote_ip, NUD_REACHABLE | NUD_PERMANENT, NLM_F_APPEND | NLM_F_CREATE, vxlan->cfg.dst_port, conf.vni, conf.vni, conf.remote_ifindex, NTF_SELF, 0, true, extack); if (err) { spin_unlock_bh(&vxlan->hash_lock[hash_index]); netdev_adjacent_change_abort(dst->remote_dev, lowerdev, dev); return err; } } if (!vxlan_addr_any(&dst->remote_ip)) __vxlan_fdb_delete(vxlan, all_zeros_mac, dst->remote_ip, vxlan->cfg.dst_port, dst->remote_vni, dst->remote_vni, dst->remote_ifindex, true); spin_unlock_bh(&vxlan->hash_lock[hash_index]); /* If vni filtering device, also update fdb entries of * all vnis that were using default remote ip */ if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) { err = vxlan_vnilist_update_group(vxlan, &dst->remote_ip, &conf.remote_ip, extack); if (err) { netdev_adjacent_change_abort(dst->remote_dev, lowerdev, dev); return err; } } } if (conf.age_interval != vxlan->cfg.age_interval) mod_timer(&vxlan->age_timer, jiffies); netdev_adjacent_change_commit(dst->remote_dev, lowerdev, dev); if (lowerdev && lowerdev != dst->remote_dev) dst->remote_dev = lowerdev; vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true); return 0; } static void vxlan_dellink(struct net_device *dev, struct list_head *head) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb_flush_desc desc = { /* Default entry is deleted at vxlan_uninit. */ .ignore_default_entry = true, }; vxlan_flush(vxlan, &desc); list_del(&vxlan->next); unregister_netdevice_queue(dev, head); if (vxlan->default_dst.remote_dev) netdev_upper_dev_unlink(vxlan->default_dst.remote_dev, dev); } static size_t vxlan_get_size(const struct net_device *dev) { return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL_INHERIT */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_DF */ nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LABEL_POLICY */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_PROXY */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_COLLECT_METADATA */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LOCALBYPASS */ /* IFLA_VXLAN_PORT_RANGE */ nla_total_size(sizeof(struct ifla_vxlan_port_range)) + nla_total_size(0) + /* IFLA_VXLAN_GBP */ nla_total_size(0) + /* IFLA_VXLAN_GPE */ nla_total_size(0) + /* IFLA_VXLAN_REMCSUM_NOPARTIAL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_VNIFILTER */ 0; } static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { const struct vxlan_dev *vxlan = netdev_priv(dev); const struct vxlan_rdst *dst = &vxlan->default_dst; struct ifla_vxlan_port_range ports = { .low = htons(vxlan->cfg.port_min), .high = htons(vxlan->cfg.port_max), }; if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni))) goto nla_put_failure; if (!vxlan_addr_any(&dst->remote_ip)) { if (dst->remote_ip.sa.sa_family == AF_INET) { if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP, dst->remote_ip.sin.sin_addr.s_addr)) goto nla_put_failure; #if IS_ENABLED(CONFIG_IPV6) } else { if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6, &dst->remote_ip.sin6.sin6_addr)) goto nla_put_failure; #endif } } if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex)) goto nla_put_failure; if (!vxlan_addr_any(&vxlan->cfg.saddr)) { if (vxlan->cfg.saddr.sa.sa_family == AF_INET) { if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL, vxlan->cfg.saddr.sin.sin_addr.s_addr)) goto nla_put_failure; #if IS_ENABLED(CONFIG_IPV6) } else { if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6, &vxlan->cfg.saddr.sin6.sin6_addr)) goto nla_put_failure; #endif } } if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) || nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT, !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) || nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) || nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) || nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) || nla_put_u32(skb, IFLA_VXLAN_LABEL_POLICY, vxlan->cfg.label_policy) || nla_put_u8(skb, IFLA_VXLAN_LEARNING, !!(vxlan->cfg.flags & VXLAN_F_LEARN)) || nla_put_u8(skb, IFLA_VXLAN_PROXY, !!(vxlan->cfg.flags & VXLAN_F_PROXY)) || nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->cfg.flags & VXLAN_F_RSC)) || nla_put_u8(skb, IFLA_VXLAN_L2MISS, !!(vxlan->cfg.flags & VXLAN_F_L2MISS)) || nla_put_u8(skb, IFLA_VXLAN_L3MISS, !!(vxlan->cfg.flags & VXLAN_F_L3MISS)) || nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA, !!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) || nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) || nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) || nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) || nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM, !(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM_TX)) || nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) || nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) || nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX, !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) || nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX, !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)) || nla_put_u8(skb, IFLA_VXLAN_LOCALBYPASS, !!(vxlan->cfg.flags & VXLAN_F_LOCALBYPASS))) goto nla_put_failure; if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) goto nla_put_failure; if (vxlan->cfg.flags & VXLAN_F_GBP && nla_put_flag(skb, IFLA_VXLAN_GBP)) goto nla_put_failure; if (vxlan->cfg.flags & VXLAN_F_GPE && nla_put_flag(skb, IFLA_VXLAN_GPE)) goto nla_put_failure; if (vxlan->cfg.flags & VXLAN_F_REMCSUM_NOPARTIAL && nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL)) goto nla_put_failure; if (vxlan->cfg.flags & VXLAN_F_VNIFILTER && nla_put_u8(skb, IFLA_VXLAN_VNIFILTER, !!(vxlan->cfg.flags & VXLAN_F_VNIFILTER))) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct net *vxlan_get_link_net(const struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); return READ_ONCE(vxlan->net); } static struct rtnl_link_ops vxlan_link_ops __read_mostly = { .kind = "vxlan", .maxtype = IFLA_VXLAN_MAX, .policy = vxlan_policy, .priv_size = sizeof(struct vxlan_dev), .setup = vxlan_setup, .validate = vxlan_validate, .newlink = vxlan_newlink, .changelink = vxlan_changelink, .dellink = vxlan_dellink, .get_size = vxlan_get_size, .fill_info = vxlan_fill_info, .get_link_net = vxlan_get_link_net, }; struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf) { struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; int err; memset(&tb, 0, sizeof(tb)); dev = rtnl_create_link(net, name, name_assign_type, &vxlan_link_ops, tb, NULL); if (IS_ERR(dev)) return dev; err = __vxlan_dev_create(net, dev, conf, NULL); if (err < 0) { free_netdev(dev); return ERR_PTR(err); } err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) { LIST_HEAD(list_kill); vxlan_dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); return ERR_PTR(err); } return dev; } EXPORT_SYMBOL_GPL(vxlan_dev_create); static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn, struct net_device *dev) { struct vxlan_dev *vxlan, *next; LIST_HEAD(list_kill); list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) { struct vxlan_rdst *dst = &vxlan->default_dst; /* In case we created vxlan device with carrier * and we loose the carrier due to module unload * we also need to remove vxlan device. In other * cases, it's not necessary and remote_ifindex * is 0 here, so no matches. */ if (dst->remote_ifindex == dev->ifindex) vxlan_dellink(vxlan->dev, &list_kill); } unregister_netdevice_many(&list_kill); } static int vxlan_netdevice_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); if (event == NETDEV_UNREGISTER) vxlan_handle_lowerdev_unregister(vn, dev); else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO) vxlan_offload_rx_ports(dev, true); else if (event == NETDEV_UDP_TUNNEL_DROP_INFO) vxlan_offload_rx_ports(dev, false); return NOTIFY_DONE; } static struct notifier_block vxlan_notifier_block __read_mostly = { .notifier_call = vxlan_netdevice_event, }; static void vxlan_fdb_offloaded_set(struct net_device *dev, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *rdst; struct vxlan_fdb *f; u32 hash_index; hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) goto out; rdst = vxlan_fdb_find_rdst(f, &fdb_info->remote_ip, fdb_info->remote_port, fdb_info->remote_vni, fdb_info->remote_ifindex); if (!rdst) goto out; rdst->offloaded = fdb_info->offloaded; out: spin_unlock_bh(&vxlan->hash_lock[hash_index]); } static int vxlan_fdb_external_learn_add(struct net_device *dev, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { struct vxlan_dev *vxlan = netdev_priv(dev); struct netlink_ext_ack *extack; u32 hash_index; int err; hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); extack = switchdev_notifier_info_to_extack(&fdb_info->info); spin_lock_bh(&vxlan->hash_lock[hash_index]); err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip, NUD_REACHABLE, NLM_F_CREATE | NLM_F_REPLACE, fdb_info->remote_port, fdb_info->vni, fdb_info->remote_vni, fdb_info->remote_ifindex, NTF_USE | NTF_SELF | NTF_EXT_LEARNED, 0, false, extack); spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; } static int vxlan_fdb_external_learn_del(struct net_device *dev, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; u32 hash_index; int err = 0; hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) err = -ENOENT; else if (f->flags & NTF_EXT_LEARNED) err = __vxlan_fdb_delete(vxlan, fdb_info->eth_addr, fdb_info->remote_ip, fdb_info->remote_port, fdb_info->vni, fdb_info->remote_vni, fdb_info->remote_ifindex, false); spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; } static int vxlan_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct switchdev_notifier_vxlan_fdb_info *fdb_info; int err = 0; switch (event) { case SWITCHDEV_VXLAN_FDB_OFFLOADED: vxlan_fdb_offloaded_set(dev, ptr); break; case SWITCHDEV_VXLAN_FDB_ADD_TO_BRIDGE: fdb_info = ptr; err = vxlan_fdb_external_learn_add(dev, fdb_info); if (err) { err = notifier_from_errno(err); break; } fdb_info->offloaded = true; vxlan_fdb_offloaded_set(dev, fdb_info); break; case SWITCHDEV_VXLAN_FDB_DEL_TO_BRIDGE: fdb_info = ptr; err = vxlan_fdb_external_learn_del(dev, fdb_info); if (err) { err = notifier_from_errno(err); break; } fdb_info->offloaded = false; vxlan_fdb_offloaded_set(dev, fdb_info); break; } return err; } static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = { .notifier_call = vxlan_switchdev_event, }; static void vxlan_fdb_nh_flush(struct nexthop *nh) { struct vxlan_fdb *fdb; struct vxlan_dev *vxlan; u32 hash_index; rcu_read_lock(); list_for_each_entry_rcu(fdb, &nh->fdb_list, nh_list) { vxlan = rcu_dereference(fdb->vdev); WARN_ON(!vxlan); hash_index = fdb_head_index(vxlan, fdb->eth_addr, vxlan->default_dst.remote_vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); if (!hlist_unhashed(&fdb->hlist)) vxlan_fdb_destroy(vxlan, fdb, false, false); spin_unlock_bh(&vxlan->hash_lock[hash_index]); } rcu_read_unlock(); } static int vxlan_nexthop_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct nh_notifier_info *info = ptr; struct nexthop *nh; if (event != NEXTHOP_EVENT_DEL) return NOTIFY_DONE; nh = nexthop_find_by_id(info->net, info->id); if (!nh) return NOTIFY_DONE; vxlan_fdb_nh_flush(nh); return NOTIFY_DONE; } static __net_init int vxlan_init_net(struct net *net) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); unsigned int h; INIT_LIST_HEAD(&vn->vxlan_list); spin_lock_init(&vn->sock_lock); vn->nexthop_notifier_block.notifier_call = vxlan_nexthop_event; for (h = 0; h < PORT_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vn->sock_list[h]); return register_nexthop_notifier(net, &vn->nexthop_notifier_block, NULL); } static void __net_exit vxlan_destroy_tunnels(struct vxlan_net *vn, struct list_head *dev_to_kill) { struct vxlan_dev *vxlan, *next; list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) vxlan_dellink(vxlan->dev, dev_to_kill); } static void __net_exit vxlan_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_to_kill) { struct net *net; ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); __unregister_nexthop_notifier(net, &vn->nexthop_notifier_block); vxlan_destroy_tunnels(vn, dev_to_kill); } } static void __net_exit vxlan_exit_net(struct net *net) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); unsigned int h; for (h = 0; h < PORT_HASH_SIZE; ++h) WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h])); } static struct pernet_operations vxlan_net_ops = { .init = vxlan_init_net, .exit_batch_rtnl = vxlan_exit_batch_rtnl, .exit = vxlan_exit_net, .id = &vxlan_net_id, .size = sizeof(struct vxlan_net), }; static int __init vxlan_init_module(void) { int rc; get_random_bytes(&vxlan_salt, sizeof(vxlan_salt)); rc = register_pernet_subsys(&vxlan_net_ops); if (rc) goto out1; rc = register_netdevice_notifier(&vxlan_notifier_block); if (rc) goto out2; rc = register_switchdev_notifier(&vxlan_switchdev_notifier_block); if (rc) goto out3; rc = rtnl_link_register(&vxlan_link_ops); if (rc) goto out4; rc = vxlan_vnifilter_init(); if (rc) goto out5; return 0; out5: rtnl_link_unregister(&vxlan_link_ops); out4: unregister_switchdev_notifier(&vxlan_switchdev_notifier_block); out3: unregister_netdevice_notifier(&vxlan_notifier_block); out2: unregister_pernet_subsys(&vxlan_net_ops); out1: return rc; } late_initcall(vxlan_init_module); static void __exit vxlan_cleanup_module(void) { vxlan_vnifilter_uninit(); rtnl_link_unregister(&vxlan_link_ops); unregister_switchdev_notifier(&vxlan_switchdev_notifier_block); unregister_netdevice_notifier(&vxlan_notifier_block); unregister_pernet_subsys(&vxlan_net_ops); /* rcu_barrier() is called by netns */ } module_exit(vxlan_cleanup_module); MODULE_LICENSE("GPL"); MODULE_VERSION(VXLAN_VERSION); MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>"); MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic"); MODULE_ALIAS_RTNL_LINK("vxlan"); |
18 500 31 498 101 103 101 103 102 100 102 103 101 100 101 102 20 100 101 103 101 185 184 483 3 184 185 186 186 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 | // SPDX-License-Identifier: GPL-2.0 /* * drivers/base/power/runtime.c - Helper functions for device runtime PM * * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. * Copyright (C) 2010 Alan Stern <stern@rowland.harvard.edu> */ #include <linux/sched/mm.h> #include <linux/ktime.h> #include <linux/hrtimer.h> #include <linux/export.h> #include <linux/pm_runtime.h> #include <linux/pm_wakeirq.h> #include <linux/rculist.h> #include <trace/events/rpm.h> #include "../base.h" #include "power.h" typedef int (*pm_callback_t)(struct device *); static pm_callback_t __rpm_get_callback(struct device *dev, size_t cb_offset) { pm_callback_t cb; const struct dev_pm_ops *ops; if (dev->pm_domain) ops = &dev->pm_domain->ops; else if (dev->type && dev->type->pm) ops = dev->type->pm; else if (dev->class && dev->class->pm) ops = dev->class->pm; else if (dev->bus && dev->bus->pm) ops = dev->bus->pm; else ops = NULL; if (ops) cb = *(pm_callback_t *)((void *)ops + cb_offset); else cb = NULL; if (!cb && dev->driver && dev->driver->pm) cb = *(pm_callback_t *)((void *)dev->driver->pm + cb_offset); return cb; } #define RPM_GET_CALLBACK(dev, callback) \ __rpm_get_callback(dev, offsetof(struct dev_pm_ops, callback)) static int rpm_resume(struct device *dev, int rpmflags); static int rpm_suspend(struct device *dev, int rpmflags); /** * update_pm_runtime_accounting - Update the time accounting of power states * @dev: Device to update the accounting for * * In order to be able to have time accounting of the various power states * (as used by programs such as PowerTOP to show the effectiveness of runtime * PM), we need to track the time spent in each state. * update_pm_runtime_accounting must be called each time before the * runtime_status field is updated, to account the time in the old state * correctly. */ static void update_pm_runtime_accounting(struct device *dev) { u64 now, last, delta; if (dev->power.disable_depth > 0) return; last = dev->power.accounting_timestamp; now = ktime_get_mono_fast_ns(); dev->power.accounting_timestamp = now; /* * Because ktime_get_mono_fast_ns() is not monotonic during * timekeeping updates, ensure that 'now' is after the last saved * timesptamp. */ if (now < last) return; delta = now - last; if (dev->power.runtime_status == RPM_SUSPENDED) dev->power.suspended_time += delta; else dev->power.active_time += delta; } static void __update_runtime_status(struct device *dev, enum rpm_status status) { update_pm_runtime_accounting(dev); trace_rpm_status(dev, status); dev->power.runtime_status = status; } static u64 rpm_get_accounted_time(struct device *dev, bool suspended) { u64 time; unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); update_pm_runtime_accounting(dev); time = suspended ? dev->power.suspended_time : dev->power.active_time; spin_unlock_irqrestore(&dev->power.lock, flags); return time; } u64 pm_runtime_active_time(struct device *dev) { return rpm_get_accounted_time(dev, false); } u64 pm_runtime_suspended_time(struct device *dev) { return rpm_get_accounted_time(dev, true); } EXPORT_SYMBOL_GPL(pm_runtime_suspended_time); /** * pm_runtime_deactivate_timer - Deactivate given device's suspend timer. * @dev: Device to handle. */ static void pm_runtime_deactivate_timer(struct device *dev) { if (dev->power.timer_expires > 0) { hrtimer_try_to_cancel(&dev->power.suspend_timer); dev->power.timer_expires = 0; } } /** * pm_runtime_cancel_pending - Deactivate suspend timer and cancel requests. * @dev: Device to handle. */ static void pm_runtime_cancel_pending(struct device *dev) { pm_runtime_deactivate_timer(dev); /* * In case there's a request pending, make sure its work function will * return without doing anything. */ dev->power.request = RPM_REQ_NONE; } /* * pm_runtime_autosuspend_expiration - Get a device's autosuspend-delay expiration time. * @dev: Device to handle. * * Compute the autosuspend-delay expiration time based on the device's * power.last_busy time. If the delay has already expired or is disabled * (negative) or the power.use_autosuspend flag isn't set, return 0. * Otherwise return the expiration time in nanoseconds (adjusted to be nonzero). * * This function may be called either with or without dev->power.lock held. * Either way it can be racy, since power.last_busy may be updated at any time. */ u64 pm_runtime_autosuspend_expiration(struct device *dev) { int autosuspend_delay; u64 expires; if (!dev->power.use_autosuspend) return 0; autosuspend_delay = READ_ONCE(dev->power.autosuspend_delay); if (autosuspend_delay < 0) return 0; expires = READ_ONCE(dev->power.last_busy); expires += (u64)autosuspend_delay * NSEC_PER_MSEC; if (expires > ktime_get_mono_fast_ns()) return expires; /* Expires in the future */ return 0; } EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration); static int dev_memalloc_noio(struct device *dev, void *data) { return dev->power.memalloc_noio; } /* * pm_runtime_set_memalloc_noio - Set a device's memalloc_noio flag. * @dev: Device to handle. * @enable: True for setting the flag and False for clearing the flag. * * Set the flag for all devices in the path from the device to the * root device in the device tree if @enable is true, otherwise clear * the flag for devices in the path whose siblings don't set the flag. * * The function should only be called by block device, or network * device driver for solving the deadlock problem during runtime * resume/suspend: * * If memory allocation with GFP_KERNEL is called inside runtime * resume/suspend callback of any one of its ancestors(or the * block device itself), the deadlock may be triggered inside the * memory allocation since it might not complete until the block * device becomes active and the involed page I/O finishes. The * situation is pointed out first by Alan Stern. Network device * are involved in iSCSI kind of situation. * * The lock of dev_hotplug_mutex is held in the function for handling * hotplug race because pm_runtime_set_memalloc_noio() may be called * in async probe(). * * The function should be called between device_add() and device_del() * on the affected device(block/network device). */ void pm_runtime_set_memalloc_noio(struct device *dev, bool enable) { static DEFINE_MUTEX(dev_hotplug_mutex); mutex_lock(&dev_hotplug_mutex); for (;;) { bool enabled; /* hold power lock since bitfield is not SMP-safe. */ spin_lock_irq(&dev->power.lock); enabled = dev->power.memalloc_noio; dev->power.memalloc_noio = enable; spin_unlock_irq(&dev->power.lock); /* * not need to enable ancestors any more if the device * has been enabled. */ if (enabled && enable) break; dev = dev->parent; /* * clear flag of the parent device only if all the * children don't set the flag because ancestor's * flag was set by any one of the descendants. */ if (!dev || (!enable && device_for_each_child(dev, NULL, dev_memalloc_noio))) break; } mutex_unlock(&dev_hotplug_mutex); } EXPORT_SYMBOL_GPL(pm_runtime_set_memalloc_noio); /** * rpm_check_suspend_allowed - Test whether a device may be suspended. * @dev: Device to test. */ static int rpm_check_suspend_allowed(struct device *dev) { int retval = 0; if (dev->power.runtime_error) retval = -EINVAL; else if (dev->power.disable_depth > 0) retval = -EACCES; else if (atomic_read(&dev->power.usage_count)) retval = -EAGAIN; else if (!dev->power.ignore_children && atomic_read(&dev->power.child_count)) retval = -EBUSY; /* Pending resume requests take precedence over suspends. */ else if ((dev->power.deferred_resume && dev->power.runtime_status == RPM_SUSPENDING) || (dev->power.request_pending && dev->power.request == RPM_REQ_RESUME)) retval = -EAGAIN; else if (__dev_pm_qos_resume_latency(dev) == 0) retval = -EPERM; else if (dev->power.runtime_status == RPM_SUSPENDED) retval = 1; return retval; } static int rpm_get_suppliers(struct device *dev) { struct device_link *link; list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) { int retval; if (!(link->flags & DL_FLAG_PM_RUNTIME)) continue; retval = pm_runtime_get_sync(link->supplier); /* Ignore suppliers with disabled runtime PM. */ if (retval < 0 && retval != -EACCES) { pm_runtime_put_noidle(link->supplier); return retval; } refcount_inc(&link->rpm_active); } return 0; } /** * pm_runtime_release_supplier - Drop references to device link's supplier. * @link: Target device link. * * Drop all runtime PM references associated with @link to its supplier device. */ void pm_runtime_release_supplier(struct device_link *link) { struct device *supplier = link->supplier; /* * The additional power.usage_count check is a safety net in case * the rpm_active refcount becomes saturated, in which case * refcount_dec_not_one() would return true forever, but it is not * strictly necessary. */ while (refcount_dec_not_one(&link->rpm_active) && atomic_read(&supplier->power.usage_count) > 0) pm_runtime_put_noidle(supplier); } static void __rpm_put_suppliers(struct device *dev, bool try_to_suspend) { struct device_link *link; list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) { pm_runtime_release_supplier(link); if (try_to_suspend) pm_request_idle(link->supplier); } } static void rpm_put_suppliers(struct device *dev) { __rpm_put_suppliers(dev, true); } static void rpm_suspend_suppliers(struct device *dev) { struct device_link *link; int idx = device_links_read_lock(); list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) pm_request_idle(link->supplier); device_links_read_unlock(idx); } /** * __rpm_callback - Run a given runtime PM callback for a given device. * @cb: Runtime PM callback to run. * @dev: Device to run the callback for. */ static int __rpm_callback(int (*cb)(struct device *), struct device *dev) __releases(&dev->power.lock) __acquires(&dev->power.lock) { int retval = 0, idx; bool use_links = dev->power.links_count > 0; if (dev->power.irq_safe) { spin_unlock(&dev->power.lock); } else { spin_unlock_irq(&dev->power.lock); /* * Resume suppliers if necessary. * * The device's runtime PM status cannot change until this * routine returns, so it is safe to read the status outside of * the lock. */ if (use_links && dev->power.runtime_status == RPM_RESUMING) { idx = device_links_read_lock(); retval = rpm_get_suppliers(dev); if (retval) { rpm_put_suppliers(dev); goto fail; } device_links_read_unlock(idx); } } if (cb) retval = cb(dev); if (dev->power.irq_safe) { spin_lock(&dev->power.lock); } else { /* * If the device is suspending and the callback has returned * success, drop the usage counters of the suppliers that have * been reference counted on its resume. * * Do that if resume fails too. */ if (use_links && ((dev->power.runtime_status == RPM_SUSPENDING && !retval) || (dev->power.runtime_status == RPM_RESUMING && retval))) { idx = device_links_read_lock(); __rpm_put_suppliers(dev, false); fail: device_links_read_unlock(idx); } spin_lock_irq(&dev->power.lock); } return retval; } /** * rpm_callback - Run a given runtime PM callback for a given device. * @cb: Runtime PM callback to run. * @dev: Device to run the callback for. */ static int rpm_callback(int (*cb)(struct device *), struct device *dev) { int retval; if (dev->power.memalloc_noio) { unsigned int noio_flag; /* * Deadlock might be caused if memory allocation with * GFP_KERNEL happens inside runtime_suspend and * runtime_resume callbacks of one block device's * ancestor or the block device itself. Network * device might be thought as part of iSCSI block * device, so network device and its ancestor should * be marked as memalloc_noio too. */ noio_flag = memalloc_noio_save(); retval = __rpm_callback(cb, dev); memalloc_noio_restore(noio_flag); } else { retval = __rpm_callback(cb, dev); } dev->power.runtime_error = retval; return retval != -EACCES ? retval : -EIO; } /** * rpm_idle - Notify device bus type if the device can be suspended. * @dev: Device to notify the bus type about. * @rpmflags: Flag bits. * * Check if the device's runtime PM status allows it to be suspended. If * another idle notification has been started earlier, return immediately. If * the RPM_ASYNC flag is set then queue an idle-notification request; otherwise * run the ->runtime_idle() callback directly. If the ->runtime_idle callback * doesn't exist or if it returns 0, call rpm_suspend with the RPM_AUTO flag. * * This function must be called under dev->power.lock with interrupts disabled. */ static int rpm_idle(struct device *dev, int rpmflags) { int (*callback)(struct device *); int retval; trace_rpm_idle(dev, rpmflags); retval = rpm_check_suspend_allowed(dev); if (retval < 0) ; /* Conditions are wrong. */ /* Idle notifications are allowed only in the RPM_ACTIVE state. */ else if (dev->power.runtime_status != RPM_ACTIVE) retval = -EAGAIN; /* * Any pending request other than an idle notification takes * precedence over us, except that the timer may be running. */ else if (dev->power.request_pending && dev->power.request > RPM_REQ_IDLE) retval = -EAGAIN; /* Act as though RPM_NOWAIT is always set. */ else if (dev->power.idle_notification) retval = -EINPROGRESS; if (retval) goto out; /* Pending requests need to be canceled. */ dev->power.request = RPM_REQ_NONE; callback = RPM_GET_CALLBACK(dev, runtime_idle); /* If no callback assume success. */ if (!callback || dev->power.no_callbacks) goto out; /* Carry out an asynchronous or a synchronous idle notification. */ if (rpmflags & RPM_ASYNC) { dev->power.request = RPM_REQ_IDLE; if (!dev->power.request_pending) { dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } trace_rpm_return_int(dev, _THIS_IP_, 0); return 0; } dev->power.idle_notification = true; if (dev->power.irq_safe) spin_unlock(&dev->power.lock); else spin_unlock_irq(&dev->power.lock); retval = callback(dev); if (dev->power.irq_safe) spin_lock(&dev->power.lock); else spin_lock_irq(&dev->power.lock); dev->power.idle_notification = false; wake_up_all(&dev->power.wait_queue); out: trace_rpm_return_int(dev, _THIS_IP_, retval); return retval ? retval : rpm_suspend(dev, rpmflags | RPM_AUTO); } /** * rpm_suspend - Carry out runtime suspend of given device. * @dev: Device to suspend. * @rpmflags: Flag bits. * * Check if the device's runtime PM status allows it to be suspended. * Cancel a pending idle notification, autosuspend or suspend. If * another suspend has been started earlier, either return immediately * or wait for it to finish, depending on the RPM_NOWAIT and RPM_ASYNC * flags. If the RPM_ASYNC flag is set then queue a suspend request; * otherwise run the ->runtime_suspend() callback directly. When * ->runtime_suspend succeeded, if a deferred resume was requested while * the callback was running then carry it out, otherwise send an idle * notification for its parent (if the suspend succeeded and both * ignore_children of parent->power and irq_safe of dev->power are not set). * If ->runtime_suspend failed with -EAGAIN or -EBUSY, and if the RPM_AUTO * flag is set and the next autosuspend-delay expiration time is in the * future, schedule another autosuspend attempt. * * This function must be called under dev->power.lock with interrupts disabled. */ static int rpm_suspend(struct device *dev, int rpmflags) __releases(&dev->power.lock) __acquires(&dev->power.lock) { int (*callback)(struct device *); struct device *parent = NULL; int retval; trace_rpm_suspend(dev, rpmflags); repeat: retval = rpm_check_suspend_allowed(dev); if (retval < 0) goto out; /* Conditions are wrong. */ /* Synchronous suspends are not allowed in the RPM_RESUMING state. */ if (dev->power.runtime_status == RPM_RESUMING && !(rpmflags & RPM_ASYNC)) retval = -EAGAIN; if (retval) goto out; /* If the autosuspend_delay time hasn't expired yet, reschedule. */ if ((rpmflags & RPM_AUTO) && dev->power.runtime_status != RPM_SUSPENDING) { u64 expires = pm_runtime_autosuspend_expiration(dev); if (expires != 0) { /* Pending requests need to be canceled. */ dev->power.request = RPM_REQ_NONE; /* * Optimization: If the timer is already running and is * set to expire at or before the autosuspend delay, * avoid the overhead of resetting it. Just let it * expire; pm_suspend_timer_fn() will take care of the * rest. */ if (!(dev->power.timer_expires && dev->power.timer_expires <= expires)) { /* * We add a slack of 25% to gather wakeups * without sacrificing the granularity. */ u64 slack = (u64)READ_ONCE(dev->power.autosuspend_delay) * (NSEC_PER_MSEC >> 2); dev->power.timer_expires = expires; hrtimer_start_range_ns(&dev->power.suspend_timer, ns_to_ktime(expires), slack, HRTIMER_MODE_ABS); } dev->power.timer_autosuspends = 1; goto out; } } /* Other scheduled or pending requests need to be canceled. */ pm_runtime_cancel_pending(dev); if (dev->power.runtime_status == RPM_SUSPENDING) { DEFINE_WAIT(wait); if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) { retval = -EINPROGRESS; goto out; } if (dev->power.irq_safe) { spin_unlock(&dev->power.lock); cpu_relax(); spin_lock(&dev->power.lock); goto repeat; } /* Wait for the other suspend running in parallel with us. */ for (;;) { prepare_to_wait(&dev->power.wait_queue, &wait, TASK_UNINTERRUPTIBLE); if (dev->power.runtime_status != RPM_SUSPENDING) break; spin_unlock_irq(&dev->power.lock); schedule(); spin_lock_irq(&dev->power.lock); } finish_wait(&dev->power.wait_queue, &wait); goto repeat; } if (dev->power.no_callbacks) goto no_callback; /* Assume success. */ /* Carry out an asynchronous or a synchronous suspend. */ if (rpmflags & RPM_ASYNC) { dev->power.request = (rpmflags & RPM_AUTO) ? RPM_REQ_AUTOSUSPEND : RPM_REQ_SUSPEND; if (!dev->power.request_pending) { dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } goto out; } __update_runtime_status(dev, RPM_SUSPENDING); callback = RPM_GET_CALLBACK(dev, runtime_suspend); dev_pm_enable_wake_irq_check(dev, true); retval = rpm_callback(callback, dev); if (retval) goto fail; dev_pm_enable_wake_irq_complete(dev); no_callback: __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_deactivate_timer(dev); if (dev->parent) { parent = dev->parent; atomic_add_unless(&parent->power.child_count, -1, 0); } wake_up_all(&dev->power.wait_queue); if (dev->power.deferred_resume) { dev->power.deferred_resume = false; rpm_resume(dev, 0); retval = -EAGAIN; goto out; } if (dev->power.irq_safe) goto out; /* Maybe the parent is now able to suspend. */ if (parent && !parent->power.ignore_children) { spin_unlock(&dev->power.lock); spin_lock(&parent->power.lock); rpm_idle(parent, RPM_ASYNC); spin_unlock(&parent->power.lock); spin_lock(&dev->power.lock); } /* Maybe the suppliers are now able to suspend. */ if (dev->power.links_count > 0) { spin_unlock_irq(&dev->power.lock); rpm_suspend_suppliers(dev); spin_lock_irq(&dev->power.lock); } out: trace_rpm_return_int(dev, _THIS_IP_, retval); return retval; fail: dev_pm_disable_wake_irq_check(dev, true); __update_runtime_status(dev, RPM_ACTIVE); dev->power.deferred_resume = false; wake_up_all(&dev->power.wait_queue); if (retval == -EAGAIN || retval == -EBUSY) { dev->power.runtime_error = 0; /* * If the callback routine failed an autosuspend, and * if the last_busy time has been updated so that there * is a new autosuspend expiration time, automatically * reschedule another autosuspend. */ if ((rpmflags & RPM_AUTO) && pm_runtime_autosuspend_expiration(dev) != 0) goto repeat; } else { pm_runtime_cancel_pending(dev); } goto out; } /** * rpm_resume - Carry out runtime resume of given device. * @dev: Device to resume. * @rpmflags: Flag bits. * * Check if the device's runtime PM status allows it to be resumed. Cancel * any scheduled or pending requests. If another resume has been started * earlier, either return immediately or wait for it to finish, depending on the * RPM_NOWAIT and RPM_ASYNC flags. Similarly, if there's a suspend running in * parallel with this function, either tell the other process to resume after * suspending (deferred_resume) or wait for it to finish. If the RPM_ASYNC * flag is set then queue a resume request; otherwise run the * ->runtime_resume() callback directly. Queue an idle notification for the * device if the resume succeeded. * * This function must be called under dev->power.lock with interrupts disabled. */ static int rpm_resume(struct device *dev, int rpmflags) __releases(&dev->power.lock) __acquires(&dev->power.lock) { int (*callback)(struct device *); struct device *parent = NULL; int retval = 0; trace_rpm_resume(dev, rpmflags); repeat: if (dev->power.runtime_error) { retval = -EINVAL; } else if (dev->power.disable_depth > 0) { if (dev->power.runtime_status == RPM_ACTIVE && dev->power.last_status == RPM_ACTIVE) retval = 1; else retval = -EACCES; } if (retval) goto out; /* * Other scheduled or pending requests need to be canceled. Small * optimization: If an autosuspend timer is running, leave it running * rather than cancelling it now only to restart it again in the near * future. */ dev->power.request = RPM_REQ_NONE; if (!dev->power.timer_autosuspends) pm_runtime_deactivate_timer(dev); if (dev->power.runtime_status == RPM_ACTIVE) { retval = 1; goto out; } if (dev->power.runtime_status == RPM_RESUMING || dev->power.runtime_status == RPM_SUSPENDING) { DEFINE_WAIT(wait); if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) { if (dev->power.runtime_status == RPM_SUSPENDING) { dev->power.deferred_resume = true; if (rpmflags & RPM_NOWAIT) retval = -EINPROGRESS; } else { retval = -EINPROGRESS; } goto out; } if (dev->power.irq_safe) { spin_unlock(&dev->power.lock); cpu_relax(); spin_lock(&dev->power.lock); goto repeat; } /* Wait for the operation carried out in parallel with us. */ for (;;) { prepare_to_wait(&dev->power.wait_queue, &wait, TASK_UNINTERRUPTIBLE); if (dev->power.runtime_status != RPM_RESUMING && dev->power.runtime_status != RPM_SUSPENDING) break; spin_unlock_irq(&dev->power.lock); schedule(); spin_lock_irq(&dev->power.lock); } finish_wait(&dev->power.wait_queue, &wait); goto repeat; } /* * See if we can skip waking up the parent. This is safe only if * power.no_callbacks is set, because otherwise we don't know whether * the resume will actually succeed. */ if (dev->power.no_callbacks && !parent && dev->parent) { spin_lock_nested(&dev->parent->power.lock, SINGLE_DEPTH_NESTING); if (dev->parent->power.disable_depth > 0 || dev->parent->power.ignore_children || dev->parent->power.runtime_status == RPM_ACTIVE) { atomic_inc(&dev->parent->power.child_count); spin_unlock(&dev->parent->power.lock); retval = 1; goto no_callback; /* Assume success. */ } spin_unlock(&dev->parent->power.lock); } /* Carry out an asynchronous or a synchronous resume. */ if (rpmflags & RPM_ASYNC) { dev->power.request = RPM_REQ_RESUME; if (!dev->power.request_pending) { dev->power.request_pending = true; queue_work(pm_wq, &dev->power.work); } retval = 0; goto out; } if (!parent && dev->parent) { /* * Increment the parent's usage counter and resume it if * necessary. Not needed if dev is irq-safe; then the * parent is permanently resumed. */ parent = dev->parent; if (dev->power.irq_safe) goto skip_parent; spin_unlock(&dev->power.lock); pm_runtime_get_noresume(parent); spin_lock(&parent->power.lock); /* * Resume the parent if it has runtime PM enabled and not been * set to ignore its children. */ if (!parent->power.disable_depth && !parent->power.ignore_children) { rpm_resume(parent, 0); if (parent->power.runtime_status != RPM_ACTIVE) retval = -EBUSY; } spin_unlock(&parent->power.lock); spin_lock(&dev->power.lock); if (retval) goto out; goto repeat; } skip_parent: if (dev->power.no_callbacks) goto no_callback; /* Assume success. */ __update_runtime_status(dev, RPM_RESUMING); callback = RPM_GET_CALLBACK(dev, runtime_resume); dev_pm_disable_wake_irq_check(dev, false); retval = rpm_callback(callback, dev); if (retval) { __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_cancel_pending(dev); dev_pm_enable_wake_irq_check(dev, false); } else { no_callback: __update_runtime_status(dev, RPM_ACTIVE); pm_runtime_mark_last_busy(dev); if (parent) atomic_inc(&parent->power.child_count); } wake_up_all(&dev->power.wait_queue); if (retval >= 0) rpm_idle(dev, RPM_ASYNC); out: if (parent && !dev->power.irq_safe) { spin_unlock_irq(&dev->power.lock); pm_runtime_put(parent); spin_lock_irq(&dev->power.lock); } trace_rpm_return_int(dev, _THIS_IP_, retval); return retval; } /** * pm_runtime_work - Universal runtime PM work function. * @work: Work structure used for scheduling the execution of this function. * * Use @work to get the device object the work is to be done for, determine what * is to be done and execute the appropriate runtime PM function. */ static void pm_runtime_work(struct work_struct *work) { struct device *dev = container_of(work, struct device, power.work); enum rpm_request req; spin_lock_irq(&dev->power.lock); if (!dev->power.request_pending) goto out; req = dev->power.request; dev->power.request = RPM_REQ_NONE; dev->power.request_pending = false; switch (req) { case RPM_REQ_NONE: break; case RPM_REQ_IDLE: rpm_idle(dev, RPM_NOWAIT); break; case RPM_REQ_SUSPEND: rpm_suspend(dev, RPM_NOWAIT); break; case RPM_REQ_AUTOSUSPEND: rpm_suspend(dev, RPM_NOWAIT | RPM_AUTO); break; case RPM_REQ_RESUME: rpm_resume(dev, RPM_NOWAIT); break; } out: spin_unlock_irq(&dev->power.lock); } /** * pm_suspend_timer_fn - Timer function for pm_schedule_suspend(). * @timer: hrtimer used by pm_schedule_suspend(). * * Check if the time is right and queue a suspend request. */ static enum hrtimer_restart pm_suspend_timer_fn(struct hrtimer *timer) { struct device *dev = container_of(timer, struct device, power.suspend_timer); unsigned long flags; u64 expires; spin_lock_irqsave(&dev->power.lock, flags); expires = dev->power.timer_expires; /* * If 'expires' is after the current time, we've been called * too early. */ if (expires > 0 && expires < ktime_get_mono_fast_ns()) { dev->power.timer_expires = 0; rpm_suspend(dev, dev->power.timer_autosuspends ? (RPM_ASYNC | RPM_AUTO) : RPM_ASYNC); } spin_unlock_irqrestore(&dev->power.lock, flags); return HRTIMER_NORESTART; } /** * pm_schedule_suspend - Set up a timer to submit a suspend request in future. * @dev: Device to suspend. * @delay: Time to wait before submitting a suspend request, in milliseconds. */ int pm_schedule_suspend(struct device *dev, unsigned int delay) { unsigned long flags; u64 expires; int retval; spin_lock_irqsave(&dev->power.lock, flags); if (!delay) { retval = rpm_suspend(dev, RPM_ASYNC); goto out; } retval = rpm_check_suspend_allowed(dev); if (retval) goto out; /* Other scheduled or pending requests need to be canceled. */ pm_runtime_cancel_pending(dev); expires = ktime_get_mono_fast_ns() + (u64)delay * NSEC_PER_MSEC; dev->power.timer_expires = expires; dev->power.timer_autosuspends = 0; hrtimer_start(&dev->power.suspend_timer, expires, HRTIMER_MODE_ABS); out: spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(pm_schedule_suspend); static int rpm_drop_usage_count(struct device *dev) { int ret; ret = atomic_sub_return(1, &dev->power.usage_count); if (ret >= 0) return ret; /* * Because rpm_resume() does not check the usage counter, it will resume * the device even if the usage counter is 0 or negative, so it is * sufficient to increment the usage counter here to reverse the change * made above. */ atomic_inc(&dev->power.usage_count); dev_warn(dev, "Runtime PM usage count underflow!\n"); return -EINVAL; } /** * __pm_runtime_idle - Entry point for runtime idle operations. * @dev: Device to send idle notification for. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, decrement the device's usage count and * return immediately if it is larger than zero (if it becomes negative, log a * warning, increment it, and return an error). Then carry out an idle * notification, either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, * or if pm_runtime_irq_safe() has been called. */ int __pm_runtime_idle(struct device *dev, int rpmflags) { unsigned long flags; int retval; if (rpmflags & RPM_GET_PUT) { retval = rpm_drop_usage_count(dev); if (retval < 0) { return retval; } else if (retval > 0) { trace_rpm_usage(dev, rpmflags); return 0; } } might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe); spin_lock_irqsave(&dev->power.lock, flags); retval = rpm_idle(dev, rpmflags); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(__pm_runtime_idle); /** * __pm_runtime_suspend - Entry point for runtime put/suspend operations. * @dev: Device to suspend. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, decrement the device's usage count and * return immediately if it is larger than zero (if it becomes negative, log a * warning, increment it, and return an error). Then carry out a suspend, * either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, * or if pm_runtime_irq_safe() has been called. */ int __pm_runtime_suspend(struct device *dev, int rpmflags) { unsigned long flags; int retval; if (rpmflags & RPM_GET_PUT) { retval = rpm_drop_usage_count(dev); if (retval < 0) { return retval; } else if (retval > 0) { trace_rpm_usage(dev, rpmflags); return 0; } } might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe); spin_lock_irqsave(&dev->power.lock, flags); retval = rpm_suspend(dev, rpmflags); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(__pm_runtime_suspend); /** * __pm_runtime_resume - Entry point for runtime resume operations. * @dev: Device to resume. * @rpmflags: Flag bits. * * If the RPM_GET_PUT flag is set, increment the device's usage count. Then * carry out a resume, either synchronous or asynchronous. * * This routine may be called in atomic context if the RPM_ASYNC flag is set, * or if pm_runtime_irq_safe() has been called. */ int __pm_runtime_resume(struct device *dev, int rpmflags) { unsigned long flags; int retval; might_sleep_if(!(rpmflags & RPM_ASYNC) && !dev->power.irq_safe && dev->power.runtime_status != RPM_ACTIVE); if (rpmflags & RPM_GET_PUT) atomic_inc(&dev->power.usage_count); spin_lock_irqsave(&dev->power.lock, flags); retval = rpm_resume(dev, rpmflags); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } EXPORT_SYMBOL_GPL(__pm_runtime_resume); /** * pm_runtime_get_conditional - Conditionally bump up device usage counter. * @dev: Device to handle. * @ign_usage_count: Whether or not to look at the current usage counter value. * * Return -EINVAL if runtime PM is disabled for @dev. * * Otherwise, if the runtime PM status of @dev is %RPM_ACTIVE and either * @ign_usage_count is %true or the runtime PM usage counter of @dev is not * zero, increment the usage counter of @dev and return 1. Otherwise, return 0 * without changing the usage counter. * * If @ign_usage_count is %true, this function can be used to prevent suspending * the device when its runtime PM status is %RPM_ACTIVE. * * If @ign_usage_count is %false, this function can be used to prevent * suspending the device when both its runtime PM status is %RPM_ACTIVE and its * runtime PM usage counter is not zero. * * The caller is responsible for decrementing the runtime PM usage counter of * @dev after this function has returned a positive value for it. */ static int pm_runtime_get_conditional(struct device *dev, bool ign_usage_count) { unsigned long flags; int retval; spin_lock_irqsave(&dev->power.lock, flags); if (dev->power.disable_depth > 0) { retval = -EINVAL; } else if (dev->power.runtime_status != RPM_ACTIVE) { retval = 0; } else if (ign_usage_count) { retval = 1; atomic_inc(&dev->power.usage_count); } else { retval = atomic_inc_not_zero(&dev->power.usage_count); } trace_rpm_usage(dev, 0); spin_unlock_irqrestore(&dev->power.lock, flags); return retval; } /** * pm_runtime_get_if_active - Bump up runtime PM usage counter if the device is * in active state * @dev: Target device. * * Increment the runtime PM usage counter of @dev if its runtime PM status is * %RPM_ACTIVE, in which case it returns 1. If the device is in a different * state, 0 is returned. -EINVAL is returned if runtime PM is disabled for the * device, in which case also the usage_count will remain unmodified. */ int pm_runtime_get_if_active(struct device *dev) { return pm_runtime_get_conditional(dev, true); } EXPORT_SYMBOL_GPL(pm_runtime_get_if_active); /** * pm_runtime_get_if_in_use - Conditionally bump up runtime PM usage counter. * @dev: Target device. * * Increment the runtime PM usage counter of @dev if its runtime PM status is * %RPM_ACTIVE and its runtime PM usage counter is greater than 0, in which case * it returns 1. If the device is in a different state or its usage_count is 0, * 0 is returned. -EINVAL is returned if runtime PM is disabled for the device, * in which case also the usage_count will remain unmodified. */ int pm_runtime_get_if_in_use(struct device *dev) { return pm_runtime_get_conditional(dev, false); } EXPORT_SYMBOL_GPL(pm_runtime_get_if_in_use); /** * __pm_runtime_set_status - Set runtime PM status of a device. * @dev: Device to handle. * @status: New runtime PM status of the device. * * If runtime PM of the device is disabled or its power.runtime_error field is * different from zero, the status may be changed either to RPM_ACTIVE, or to * RPM_SUSPENDED, as long as that reflects the actual state of the device. * However, if the device has a parent and the parent is not active, and the * parent's power.ignore_children flag is unset, the device's status cannot be * set to RPM_ACTIVE, so -EBUSY is returned in that case. * * If successful, __pm_runtime_set_status() clears the power.runtime_error field * and the device parent's counter of unsuspended children is modified to * reflect the new status. If the new status is RPM_SUSPENDED, an idle * notification request for the parent is submitted. * * If @dev has any suppliers (as reflected by device links to them), and @status * is RPM_ACTIVE, they will be activated upfront and if the activation of one * of them fails, the status of @dev will be changed to RPM_SUSPENDED (instead * of the @status value) and the suppliers will be deacticated on exit. The * error returned by the failing supplier activation will be returned in that * case. */ int __pm_runtime_set_status(struct device *dev, unsigned int status) { struct device *parent = dev->parent; bool notify_parent = false; unsigned long flags; int error = 0; if (status != RPM_ACTIVE && status != RPM_SUSPENDED) return -EINVAL; spin_lock_irqsave(&dev->power.lock, flags); /* * Prevent PM-runtime from being enabled for the device or return an * error if it is enabled already and working. */ if (dev->power.runtime_error || dev->power.disable_depth) dev->power.disable_depth++; else error = -EAGAIN; spin_unlock_irqrestore(&dev->power.lock, flags); if (error) return error; /* * If the new status is RPM_ACTIVE, the suppliers can be activated * upfront regardless of the current status, because next time * rpm_put_suppliers() runs, the rpm_active refcounts of the links * involved will be dropped down to one anyway. */ if (status == RPM_ACTIVE) { int idx = device_links_read_lock(); error = rpm_get_suppliers(dev); if (error) status = RPM_SUSPENDED; device_links_read_unlock(idx); } spin_lock_irqsave(&dev->power.lock, flags); if (dev->power.runtime_status == status || !parent) goto out_set; if (status == RPM_SUSPENDED) { atomic_add_unless(&parent->power.child_count, -1, 0); notify_parent = !parent->power.ignore_children; } else { spin_lock_nested(&parent->power.lock, SINGLE_DEPTH_NESTING); /* * It is invalid to put an active child under a parent that is * not active, has runtime PM enabled and the * 'power.ignore_children' flag unset. */ if (!parent->power.disable_depth && !parent->power.ignore_children && parent->power.runtime_status != RPM_ACTIVE) { dev_err(dev, "runtime PM trying to activate child device %s but parent (%s) is not active\n", dev_name(dev), dev_name(parent)); error = -EBUSY; } else if (dev->power.runtime_status == RPM_SUSPENDED) { atomic_inc(&parent->power.child_count); } spin_unlock(&parent->power.lock); if (error) { status = RPM_SUSPENDED; goto out; } } out_set: __update_runtime_status(dev, status); if (!error) dev->power.runtime_error = 0; out: spin_unlock_irqrestore(&dev->power.lock, flags); if (notify_parent) pm_request_idle(parent); if (status == RPM_SUSPENDED) { int idx = device_links_read_lock(); rpm_put_suppliers(dev); device_links_read_unlock(idx); } pm_runtime_enable(dev); return error; } EXPORT_SYMBOL_GPL(__pm_runtime_set_status); /** * __pm_runtime_barrier - Cancel pending requests and wait for completions. * @dev: Device to handle. * * Flush all pending requests for the device from pm_wq and wait for all * runtime PM operations involving the device in progress to complete. * * Should be called under dev->power.lock with interrupts disabled. */ static void __pm_runtime_barrier(struct device *dev) { pm_runtime_deactivate_timer(dev); if (dev->power.request_pending) { dev->power.request = RPM_REQ_NONE; spin_unlock_irq(&dev->power.lock); cancel_work_sync(&dev->power.work); spin_lock_irq(&dev->power.lock); dev->power.request_pending = false; } if (dev->power.runtime_status == RPM_SUSPENDING || dev->power.runtime_status == RPM_RESUMING || dev->power.idle_notification) { DEFINE_WAIT(wait); /* Suspend, wake-up or idle notification in progress. */ for (;;) { prepare_to_wait(&dev->power.wait_queue, &wait, TASK_UNINTERRUPTIBLE); if (dev->power.runtime_status != RPM_SUSPENDING && dev->power.runtime_status != RPM_RESUMING && !dev->power.idle_notification) break; spin_unlock_irq(&dev->power.lock); schedule(); spin_lock_irq(&dev->power.lock); } finish_wait(&dev->power.wait_queue, &wait); } } /** * pm_runtime_barrier - Flush pending requests and wait for completions. * @dev: Device to handle. * * Prevent the device from being suspended by incrementing its usage counter and * if there's a pending resume request for the device, wake the device up. * Next, make sure that all pending requests for the device have been flushed * from pm_wq and wait for all runtime PM operations involving the device in * progress to complete. * * Return value: * 1, if there was a resume request pending and the device had to be woken up, * 0, otherwise */ int pm_runtime_barrier(struct device *dev) { int retval = 0; pm_runtime_get_noresume(dev); spin_lock_irq(&dev->power.lock); if (dev->power.request_pending && dev->power.request == RPM_REQ_RESUME) { rpm_resume(dev, 0); retval = 1; } __pm_runtime_barrier(dev); spin_unlock_irq(&dev->power.lock); pm_runtime_put_noidle(dev); return retval; } EXPORT_SYMBOL_GPL(pm_runtime_barrier); /** * __pm_runtime_disable - Disable runtime PM of a device. * @dev: Device to handle. * @check_resume: If set, check if there's a resume request for the device. * * Increment power.disable_depth for the device and if it was zero previously, * cancel all pending runtime PM requests for the device and wait for all * operations in progress to complete. The device can be either active or * suspended after its runtime PM has been disabled. * * If @check_resume is set and there's a resume request pending when * __pm_runtime_disable() is called and power.disable_depth is zero, the * function will wake up the device before disabling its runtime PM. */ void __pm_runtime_disable(struct device *dev, bool check_resume) { spin_lock_irq(&dev->power.lock); if (dev->power.disable_depth > 0) { dev->power.disable_depth++; goto out; } /* * Wake up the device if there's a resume request pending, because that * means there probably is some I/O to process and disabling runtime PM * shouldn't prevent the device from processing the I/O. */ if (check_resume && dev->power.request_pending && dev->power.request == RPM_REQ_RESUME) { /* * Prevent suspends and idle notifications from being carried * out after we have woken up the device. */ pm_runtime_get_noresume(dev); rpm_resume(dev, 0); pm_runtime_put_noidle(dev); } /* Update time accounting before disabling PM-runtime. */ update_pm_runtime_accounting(dev); if (!dev->power.disable_depth++) { __pm_runtime_barrier(dev); dev->power.last_status = dev->power.runtime_status; } out: spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(__pm_runtime_disable); /** * pm_runtime_enable - Enable runtime PM of a device. * @dev: Device to handle. */ void pm_runtime_enable(struct device *dev) { unsigned long flags; spin_lock_irqsave(&dev->power.lock, flags); if (!dev->power.disable_depth) { dev_warn(dev, "Unbalanced %s!\n", __func__); goto out; } if (--dev->power.disable_depth > 0) goto out; dev->power.last_status = RPM_INVALID; dev->power.accounting_timestamp = ktime_get_mono_fast_ns(); if (dev->power.runtime_status == RPM_SUSPENDED && !dev->power.ignore_children && atomic_read(&dev->power.child_count) > 0) dev_warn(dev, "Enabling runtime PM for inactive device with active children\n"); out: spin_unlock_irqrestore(&dev->power.lock, flags); } EXPORT_SYMBOL_GPL(pm_runtime_enable); static void pm_runtime_disable_action(void *data) { pm_runtime_dont_use_autosuspend(data); pm_runtime_disable(data); } /** * devm_pm_runtime_enable - devres-enabled version of pm_runtime_enable. * * NOTE: this will also handle calling pm_runtime_dont_use_autosuspend() for * you at driver exit time if needed. * * @dev: Device to handle. */ int devm_pm_runtime_enable(struct device *dev) { pm_runtime_enable(dev); return devm_add_action_or_reset(dev, pm_runtime_disable_action, dev); } EXPORT_SYMBOL_GPL(devm_pm_runtime_enable); /** * pm_runtime_forbid - Block runtime PM of a device. * @dev: Device to handle. * * Increase the device's usage count and clear its power.runtime_auto flag, * so that it cannot be suspended at run time until pm_runtime_allow() is called * for it. */ void pm_runtime_forbid(struct device *dev) { spin_lock_irq(&dev->power.lock); if (!dev->power.runtime_auto) goto out; dev->power.runtime_auto = false; atomic_inc(&dev->power.usage_count); rpm_resume(dev, 0); out: spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_forbid); /** * pm_runtime_allow - Unblock runtime PM of a device. * @dev: Device to handle. * * Decrease the device's usage count and set its power.runtime_auto flag. */ void pm_runtime_allow(struct device *dev) { int ret; spin_lock_irq(&dev->power.lock); if (dev->power.runtime_auto) goto out; dev->power.runtime_auto = true; ret = rpm_drop_usage_count(dev); if (ret == 0) rpm_idle(dev, RPM_AUTO | RPM_ASYNC); else if (ret > 0) trace_rpm_usage(dev, RPM_AUTO | RPM_ASYNC); out: spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_allow); /** * pm_runtime_no_callbacks - Ignore runtime PM callbacks for a device. * @dev: Device to handle. * * Set the power.no_callbacks flag, which tells the PM core that this * device is power-managed through its parent and has no runtime PM * callbacks of its own. The runtime sysfs attributes will be removed. */ void pm_runtime_no_callbacks(struct device *dev) { spin_lock_irq(&dev->power.lock); dev->power.no_callbacks = 1; spin_unlock_irq(&dev->power.lock); if (device_is_registered(dev)) rpm_sysfs_remove(dev); } EXPORT_SYMBOL_GPL(pm_runtime_no_callbacks); /** * pm_runtime_irq_safe - Leave interrupts disabled during callbacks. * @dev: Device to handle * * Set the power.irq_safe flag, which tells the PM core that the * ->runtime_suspend() and ->runtime_resume() callbacks for this device should * always be invoked with the spinlock held and interrupts disabled. It also * causes the parent's usage counter to be permanently incremented, preventing * the parent from runtime suspending -- otherwise an irq-safe child might have * to wait for a non-irq-safe parent. */ void pm_runtime_irq_safe(struct device *dev) { if (dev->parent) pm_runtime_get_sync(dev->parent); spin_lock_irq(&dev->power.lock); dev->power.irq_safe = 1; spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_irq_safe); /** * update_autosuspend - Handle a change to a device's autosuspend settings. * @dev: Device to handle. * @old_delay: The former autosuspend_delay value. * @old_use: The former use_autosuspend value. * * Prevent runtime suspend if the new delay is negative and use_autosuspend is * set; otherwise allow it. Send an idle notification if suspends are allowed. * * This function must be called under dev->power.lock with interrupts disabled. */ static void update_autosuspend(struct device *dev, int old_delay, int old_use) { int delay = dev->power.autosuspend_delay; /* Should runtime suspend be prevented now? */ if (dev->power.use_autosuspend && delay < 0) { /* If it used to be allowed then prevent it. */ if (!old_use || old_delay >= 0) { atomic_inc(&dev->power.usage_count); rpm_resume(dev, 0); } else { trace_rpm_usage(dev, 0); } } /* Runtime suspend should be allowed now. */ else { /* If it used to be prevented then allow it. */ if (old_use && old_delay < 0) atomic_dec(&dev->power.usage_count); /* Maybe we can autosuspend now. */ rpm_idle(dev, RPM_AUTO); } } /** * pm_runtime_set_autosuspend_delay - Set a device's autosuspend_delay value. * @dev: Device to handle. * @delay: Value of the new delay in milliseconds. * * Set the device's power.autosuspend_delay value. If it changes to negative * and the power.use_autosuspend flag is set, prevent runtime suspends. If it * changes the other way, allow runtime suspends. */ void pm_runtime_set_autosuspend_delay(struct device *dev, int delay) { int old_delay, old_use; spin_lock_irq(&dev->power.lock); old_delay = dev->power.autosuspend_delay; old_use = dev->power.use_autosuspend; dev->power.autosuspend_delay = delay; update_autosuspend(dev, old_delay, old_use); spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(pm_runtime_set_autosuspend_delay); /** * __pm_runtime_use_autosuspend - Set a device's use_autosuspend flag. * @dev: Device to handle. * @use: New value for use_autosuspend. * * Set the device's power.use_autosuspend flag, and allow or prevent runtime * suspends as needed. */ void __pm_runtime_use_autosuspend(struct device *dev, bool use) { int old_delay, old_use; spin_lock_irq(&dev->power.lock); old_delay = dev->power.autosuspend_delay; old_use = dev->power.use_autosuspend; dev->power.use_autosuspend = use; update_autosuspend(dev, old_delay, old_use); spin_unlock_irq(&dev->power.lock); } EXPORT_SYMBOL_GPL(__pm_runtime_use_autosuspend); /** * pm_runtime_init - Initialize runtime PM fields in given device object. * @dev: Device object to initialize. */ void pm_runtime_init(struct device *dev) { dev->power.runtime_status = RPM_SUSPENDED; dev->power.last_status = RPM_INVALID; dev->power.idle_notification = false; dev->power.disable_depth = 1; atomic_set(&dev->power.usage_count, 0); dev->power.runtime_error = 0; atomic_set(&dev->power.child_count, 0); pm_suspend_ignore_children(dev, false); dev->power.runtime_auto = true; dev->power.request_pending = false; dev->power.request = RPM_REQ_NONE; dev->power.deferred_resume = false; dev->power.needs_force_resume = 0; INIT_WORK(&dev->power.work, pm_runtime_work); dev->power.timer_expires = 0; hrtimer_init(&dev->power.suspend_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); dev->power.suspend_timer.function = pm_suspend_timer_fn; init_waitqueue_head(&dev->power.wait_queue); } /** * pm_runtime_reinit - Re-initialize runtime PM fields in given device object. * @dev: Device object to re-initialize. */ void pm_runtime_reinit(struct device *dev) { if (!pm_runtime_enabled(dev)) { if (dev->power.runtime_status == RPM_ACTIVE) pm_runtime_set_suspended(dev); if (dev->power.irq_safe) { spin_lock_irq(&dev->power.lock); dev->power.irq_safe = 0; spin_unlock_irq(&dev->power.lock); if (dev->parent) pm_runtime_put(dev->parent); } } } /** * pm_runtime_remove - Prepare for removing a device from device hierarchy. * @dev: Device object being removed from device hierarchy. */ void pm_runtime_remove(struct device *dev) { __pm_runtime_disable(dev, false); pm_runtime_reinit(dev); } /** * pm_runtime_get_suppliers - Resume and reference-count supplier devices. * @dev: Consumer device. */ void pm_runtime_get_suppliers(struct device *dev) { struct device_link *link; int idx; idx = device_links_read_lock(); list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) if (link->flags & DL_FLAG_PM_RUNTIME) { link->supplier_preactivated = true; pm_runtime_get_sync(link->supplier); } device_links_read_unlock(idx); } /** * pm_runtime_put_suppliers - Drop references to supplier devices. * @dev: Consumer device. */ void pm_runtime_put_suppliers(struct device *dev) { struct device_link *link; int idx; idx = device_links_read_lock(); list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, device_links_read_lock_held()) if (link->supplier_preactivated) { link->supplier_preactivated = false; pm_runtime_put(link->supplier); } device_links_read_unlock(idx); } void pm_runtime_new_link(struct device *dev) { spin_lock_irq(&dev->power.lock); dev->power.links_count++; spin_unlock_irq(&dev->power.lock); } static void pm_runtime_drop_link_count(struct device *dev) { spin_lock_irq(&dev->power.lock); WARN_ON(dev->power.links_count == 0); dev->power.links_count--; spin_unlock_irq(&dev->power.lock); } /** * pm_runtime_drop_link - Prepare for device link removal. * @link: Device link going away. * * Drop the link count of the consumer end of @link and decrement the supplier * device's runtime PM usage counter as many times as needed to drop all of the * PM runtime reference to it from the consumer. */ void pm_runtime_drop_link(struct device_link *link) { if (!(link->flags & DL_FLAG_PM_RUNTIME)) return; pm_runtime_drop_link_count(link->consumer); pm_runtime_release_supplier(link); pm_request_idle(link->supplier); } static bool pm_runtime_need_not_resume(struct device *dev) { return atomic_read(&dev->power.usage_count) <= 1 && (atomic_read(&dev->power.child_count) == 0 || dev->power.ignore_children); } /** * pm_runtime_force_suspend - Force a device into suspend state if needed. * @dev: Device to suspend. * * Disable runtime PM so we safely can check the device's runtime PM status and * if it is active, invoke its ->runtime_suspend callback to suspend it and * change its runtime PM status field to RPM_SUSPENDED. Also, if the device's * usage and children counters don't indicate that the device was in use before * the system-wide transition under way, decrement its parent's children counter * (if there is a parent). Keep runtime PM disabled to preserve the state * unless we encounter errors. * * Typically this function may be invoked from a system suspend callback to make * sure the device is put into low power state and it should only be used during * system-wide PM transitions to sleep states. It assumes that the analogous * pm_runtime_force_resume() will be used to resume the device. * * Do not use with DPM_FLAG_SMART_SUSPEND as this can lead to an inconsistent * state where this function has called the ->runtime_suspend callback but the * PM core marks the driver as runtime active. */ int pm_runtime_force_suspend(struct device *dev) { int (*callback)(struct device *); int ret; pm_runtime_disable(dev); if (pm_runtime_status_suspended(dev)) return 0; callback = RPM_GET_CALLBACK(dev, runtime_suspend); dev_pm_enable_wake_irq_check(dev, true); ret = callback ? callback(dev) : 0; if (ret) goto err; dev_pm_enable_wake_irq_complete(dev); /* * If the device can stay in suspend after the system-wide transition * to the working state that will follow, drop the children counter of * its parent, but set its status to RPM_SUSPENDED anyway in case this * function will be called again for it in the meantime. */ if (pm_runtime_need_not_resume(dev)) { pm_runtime_set_suspended(dev); } else { __update_runtime_status(dev, RPM_SUSPENDED); dev->power.needs_force_resume = 1; } return 0; err: dev_pm_disable_wake_irq_check(dev, true); pm_runtime_enable(dev); return ret; } EXPORT_SYMBOL_GPL(pm_runtime_force_suspend); /** * pm_runtime_force_resume - Force a device into resume state if needed. * @dev: Device to resume. * * Prior invoking this function we expect the user to have brought the device * into low power state by a call to pm_runtime_force_suspend(). Here we reverse * those actions and bring the device into full power, if it is expected to be * used on system resume. In the other case, we defer the resume to be managed * via runtime PM. * * Typically this function may be invoked from a system resume callback. */ int pm_runtime_force_resume(struct device *dev) { int (*callback)(struct device *); int ret = 0; if (!pm_runtime_status_suspended(dev) || !dev->power.needs_force_resume) goto out; /* * The value of the parent's children counter is correct already, so * just update the status of the device. */ __update_runtime_status(dev, RPM_ACTIVE); callback = RPM_GET_CALLBACK(dev, runtime_resume); dev_pm_disable_wake_irq_check(dev, false); ret = callback ? callback(dev) : 0; if (ret) { pm_runtime_set_suspended(dev); dev_pm_enable_wake_irq_check(dev, false); goto out; } pm_runtime_mark_last_busy(dev); out: dev->power.needs_force_resume = 0; pm_runtime_enable(dev); return ret; } EXPORT_SYMBOL_GPL(pm_runtime_force_resume); |
4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 | // SPDX-License-Identifier: GPL-2.0-only /* * scsi.c Copyright (C) 1992 Drew Eckhardt * Copyright (C) 1993, 1994, 1995, 1999 Eric Youngdale * Copyright (C) 2002, 2003 Christoph Hellwig * * generic mid-level SCSI driver * Initial versions: Drew Eckhardt * Subsequent revisions: Eric Youngdale * * <drew@colorado.edu> * * Bug correction thanks go to : * Rik Faith <faith@cs.unc.edu> * Tommy Thorn <tthorn> * Thomas Wuensche <tw@fgb1.fgb.mw.tu-muenchen.de> * * Modified by Eric Youngdale eric@andante.org or ericy@gnu.ai.mit.edu to * add scatter-gather, multiple outstanding request, and other * enhancements. * * Native multichannel, wide scsi, /proc/scsi and hot plugging * support added by Michael Neuffer <mike@i-connect.net> * * Added request_module("scsi_hostadapter") for kerneld: * (Put an "alias scsi_hostadapter your_hostadapter" in /etc/modprobe.conf) * Bjorn Ekwall <bj0rn@blox.se> * (changed to kmod) * * Major improvements to the timeout, abort, and reset processing, * as well as performance modifications for large queue depths by * Leonard N. Zubkoff <lnz@dandelion.com> * * Converted cli() code to spinlocks, Ingo Molnar * * Jiffies wrap fixes (host->resetting), 3 Dec 1998 Andrea Arcangeli * * out_of_space hacks, D. Gilbert (dpg) 990608 */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/delay.h> #include <linux/init.h> #include <linux/completion.h> #include <linux/unistd.h> #include <linux/spinlock.h> #include <linux/kmod.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/unaligned.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_device.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_host.h> #include <scsi/scsi_tcq.h> #include "scsi_priv.h" #include "scsi_logging.h" #define CREATE_TRACE_POINTS #include <trace/events/scsi.h> /* * Definitions and constants. */ /* * Note - the initial logging level can be set here to log events at boot time. * After the system is up, you may enable logging via the /proc interface. */ unsigned int scsi_logging_level; #if defined(CONFIG_SCSI_LOGGING) EXPORT_SYMBOL(scsi_logging_level); #endif #ifdef CONFIG_SCSI_LOGGING void scsi_log_send(struct scsi_cmnd *cmd) { unsigned int level; /* * If ML QUEUE log level is greater than or equal to: * * 1: nothing (match completion) * * 2: log opcode + command of all commands + cmd address * * 3: same as 2 * * 4: same as 3 */ if (unlikely(scsi_logging_level)) { level = SCSI_LOG_LEVEL(SCSI_LOG_MLQUEUE_SHIFT, SCSI_LOG_MLQUEUE_BITS); if (level > 1) { scmd_printk(KERN_INFO, cmd, "Send: scmd 0x%p\n", cmd); scsi_print_command(cmd); } } } void scsi_log_completion(struct scsi_cmnd *cmd, int disposition) { unsigned int level; /* * If ML COMPLETE log level is greater than or equal to: * * 1: log disposition, result, opcode + command, and conditionally * sense data for failures or non SUCCESS dispositions. * * 2: same as 1 but for all command completions. * * 3: same as 2 * * 4: same as 3 plus dump extra junk */ if (unlikely(scsi_logging_level)) { level = SCSI_LOG_LEVEL(SCSI_LOG_MLCOMPLETE_SHIFT, SCSI_LOG_MLCOMPLETE_BITS); if (((level > 0) && (cmd->result || disposition != SUCCESS)) || (level > 1)) { scsi_print_result(cmd, "Done", disposition); scsi_print_command(cmd); if (scsi_status_is_check_condition(cmd->result)) scsi_print_sense(cmd); if (level > 3) scmd_printk(KERN_INFO, cmd, "scsi host busy %d failed %d\n", scsi_host_busy(cmd->device->host), cmd->device->host->host_failed); } } } #endif /** * scsi_finish_command - cleanup and pass command back to upper layer * @cmd: the command * * Description: Pass command off to upper layer for finishing of I/O * request, waking processes that are waiting on results, * etc. */ void scsi_finish_command(struct scsi_cmnd *cmd) { struct scsi_device *sdev = cmd->device; struct scsi_target *starget = scsi_target(sdev); struct Scsi_Host *shost = sdev->host; struct scsi_driver *drv; unsigned int good_bytes; scsi_device_unbusy(sdev, cmd); /* * Clear the flags that say that the device/target/host is no longer * capable of accepting new commands. */ if (atomic_read(&shost->host_blocked)) atomic_set(&shost->host_blocked, 0); if (atomic_read(&starget->target_blocked)) atomic_set(&starget->target_blocked, 0); if (atomic_read(&sdev->device_blocked)) atomic_set(&sdev->device_blocked, 0); SCSI_LOG_MLCOMPLETE(4, sdev_printk(KERN_INFO, sdev, "Notifying upper driver of completion " "(result %x)\n", cmd->result)); good_bytes = scsi_bufflen(cmd); if (!blk_rq_is_passthrough(scsi_cmd_to_rq(cmd))) { int old_good_bytes = good_bytes; drv = scsi_cmd_to_driver(cmd); if (drv->done) good_bytes = drv->done(cmd); /* * USB may not give sense identifying bad sector and * simply return a residue instead, so subtract off the * residue if drv->done() error processing indicates no * change to the completion length. */ if (good_bytes == old_good_bytes) good_bytes -= scsi_get_resid(cmd); } scsi_io_completion(cmd, good_bytes); } /* * 4096 is big enough for saturating fast SCSI LUNs. */ int scsi_device_max_queue_depth(struct scsi_device *sdev) { return min_t(int, sdev->host->can_queue, 4096); } /** * scsi_change_queue_depth - change a device's queue depth * @sdev: SCSI Device in question * @depth: number of commands allowed to be queued to the driver * * Sets the device queue depth and returns the new value. */ int scsi_change_queue_depth(struct scsi_device *sdev, int depth) { depth = min_t(int, depth, scsi_device_max_queue_depth(sdev)); if (depth > 0) { sdev->queue_depth = depth; wmb(); } if (sdev->request_queue) blk_set_queue_depth(sdev->request_queue, depth); sbitmap_resize(&sdev->budget_map, sdev->queue_depth); return sdev->queue_depth; } EXPORT_SYMBOL(scsi_change_queue_depth); /** * scsi_track_queue_full - track QUEUE_FULL events to adjust queue depth * @sdev: SCSI Device in question * @depth: Current number of outstanding SCSI commands on this device, * not counting the one returned as QUEUE_FULL. * * Description: This function will track successive QUEUE_FULL events on a * specific SCSI device to determine if and when there is a * need to adjust the queue depth on the device. * * Returns: 0 - No change needed, >0 - Adjust queue depth to this new depth, * -1 - Drop back to untagged operation using host->cmd_per_lun * as the untagged command depth * * Lock Status: None held on entry * * Notes: Low level drivers may call this at any time and we will do * "The Right Thing." We are interrupt context safe. */ int scsi_track_queue_full(struct scsi_device *sdev, int depth) { /* * Don't let QUEUE_FULLs on the same * jiffies count, they could all be from * same event. */ if ((jiffies >> 4) == (sdev->last_queue_full_time >> 4)) return 0; sdev->last_queue_full_time = jiffies; if (sdev->last_queue_full_depth != depth) { sdev->last_queue_full_count = 1; sdev->last_queue_full_depth = depth; } else { sdev->last_queue_full_count++; } if (sdev->last_queue_full_count <= 10) return 0; return scsi_change_queue_depth(sdev, depth); } EXPORT_SYMBOL(scsi_track_queue_full); /** * scsi_vpd_inquiry - Request a device provide us with a VPD page * @sdev: The device to ask * @buffer: Where to put the result * @page: Which Vital Product Data to return * @len: The length of the buffer * * This is an internal helper function. You probably want to use * scsi_get_vpd_page instead. * * Returns size of the vpd page on success or a negative error number. */ static int scsi_vpd_inquiry(struct scsi_device *sdev, unsigned char *buffer, u8 page, unsigned len) { int result; unsigned char cmd[16]; if (len < 4) return -EINVAL; cmd[0] = INQUIRY; cmd[1] = 1; /* EVPD */ cmd[2] = page; cmd[3] = len >> 8; cmd[4] = len & 0xff; cmd[5] = 0; /* Control byte */ /* * I'm not convinced we need to try quite this hard to get VPD, but * all the existing users tried this hard. */ result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, buffer, len, 30 * HZ, 3, NULL); if (result) return -EIO; /* * Sanity check that we got the page back that we asked for and that * the page size is not 0. */ if (buffer[1] != page) return -EIO; result = get_unaligned_be16(&buffer[2]); if (!result) return -EIO; return result + 4; } enum scsi_vpd_parameters { SCSI_VPD_HEADER_SIZE = 4, SCSI_VPD_LIST_SIZE = 36, }; static int scsi_get_vpd_size(struct scsi_device *sdev, u8 page) { unsigned char vpd[SCSI_VPD_LIST_SIZE] __aligned(4); int result; if (sdev->no_vpd_size) return SCSI_DEFAULT_VPD_LEN; /* * Fetch the supported pages VPD and validate that the requested page * number is present. */ if (page != 0) { result = scsi_vpd_inquiry(sdev, vpd, 0, sizeof(vpd)); if (result < SCSI_VPD_HEADER_SIZE) return 0; if (result > sizeof(vpd)) { dev_warn_once(&sdev->sdev_gendev, "%s: long VPD page 0 length: %d bytes\n", __func__, result); result = sizeof(vpd); } result -= SCSI_VPD_HEADER_SIZE; if (!memchr(&vpd[SCSI_VPD_HEADER_SIZE], page, result)) return 0; } /* * Fetch the VPD page header to find out how big the page * is. This is done to prevent problems on legacy devices * which can not handle allocation lengths as large as * potentially requested by the caller. */ result = scsi_vpd_inquiry(sdev, vpd, page, SCSI_VPD_HEADER_SIZE); if (result < 0) return 0; if (result < SCSI_VPD_HEADER_SIZE) { dev_warn_once(&sdev->sdev_gendev, "%s: short VPD page 0x%02x length: %d bytes\n", __func__, page, result); return 0; } return result; } /** * scsi_get_vpd_page - Get Vital Product Data from a SCSI device * @sdev: The device to ask * @page: Which Vital Product Data to return * @buf: where to store the VPD * @buf_len: number of bytes in the VPD buffer area * * SCSI devices may optionally supply Vital Product Data. Each 'page' * of VPD is defined in the appropriate SCSI document (eg SPC, SBC). * If the device supports this VPD page, this routine fills @buf * with the data from that page and return 0. If the VPD page is not * supported or its content cannot be retrieved, -EINVAL is returned. */ int scsi_get_vpd_page(struct scsi_device *sdev, u8 page, unsigned char *buf, int buf_len) { int result, vpd_len; if (!scsi_device_supports_vpd(sdev)) return -EINVAL; vpd_len = scsi_get_vpd_size(sdev, page); if (vpd_len <= 0) return -EINVAL; vpd_len = min(vpd_len, buf_len); /* * Fetch the actual page. Since the appropriate size was reported * by the device it is now safe to ask for something bigger. */ memset(buf, 0, buf_len); result = scsi_vpd_inquiry(sdev, buf, page, vpd_len); if (result < 0) return -EINVAL; else if (result > vpd_len) dev_warn_once(&sdev->sdev_gendev, "%s: VPD page 0x%02x result %d > %d bytes\n", __func__, page, result, vpd_len); return 0; } EXPORT_SYMBOL_GPL(scsi_get_vpd_page); /** * scsi_get_vpd_buf - Get Vital Product Data from a SCSI device * @sdev: The device to ask * @page: Which Vital Product Data to return * * Returns %NULL upon failure. */ static struct scsi_vpd *scsi_get_vpd_buf(struct scsi_device *sdev, u8 page) { struct scsi_vpd *vpd_buf; int vpd_len, result; vpd_len = scsi_get_vpd_size(sdev, page); if (vpd_len <= 0) return NULL; retry_pg: /* * Fetch the actual page. Since the appropriate size was reported * by the device it is now safe to ask for something bigger. */ vpd_buf = kmalloc(sizeof(*vpd_buf) + vpd_len, GFP_KERNEL); if (!vpd_buf) return NULL; result = scsi_vpd_inquiry(sdev, vpd_buf->data, page, vpd_len); if (result < 0) { kfree(vpd_buf); return NULL; } if (result > vpd_len) { dev_warn_once(&sdev->sdev_gendev, "%s: VPD page 0x%02x result %d > %d bytes\n", __func__, page, result, vpd_len); vpd_len = result; kfree(vpd_buf); goto retry_pg; } vpd_buf->len = result; return vpd_buf; } static void scsi_update_vpd_page(struct scsi_device *sdev, u8 page, struct scsi_vpd __rcu **sdev_vpd_buf) { struct scsi_vpd *vpd_buf; vpd_buf = scsi_get_vpd_buf(sdev, page); if (!vpd_buf) return; mutex_lock(&sdev->inquiry_mutex); vpd_buf = rcu_replace_pointer(*sdev_vpd_buf, vpd_buf, lockdep_is_held(&sdev->inquiry_mutex)); mutex_unlock(&sdev->inquiry_mutex); if (vpd_buf) kfree_rcu(vpd_buf, rcu); } /** * scsi_attach_vpd - Attach Vital Product Data to a SCSI device structure * @sdev: The device to ask * * Attach the 'Device Identification' VPD page (0x83) and the * 'Unit Serial Number' VPD page (0x80) to a SCSI device * structure. This information can be used to identify the device * uniquely. */ void scsi_attach_vpd(struct scsi_device *sdev) { int i; struct scsi_vpd *vpd_buf; if (!scsi_device_supports_vpd(sdev)) return; /* Ask for all the pages supported by this device */ vpd_buf = scsi_get_vpd_buf(sdev, 0); if (!vpd_buf) return; for (i = 4; i < vpd_buf->len; i++) { if (vpd_buf->data[i] == 0x0) scsi_update_vpd_page(sdev, 0x0, &sdev->vpd_pg0); if (vpd_buf->data[i] == 0x80) scsi_update_vpd_page(sdev, 0x80, &sdev->vpd_pg80); if (vpd_buf->data[i] == 0x83) scsi_update_vpd_page(sdev, 0x83, &sdev->vpd_pg83); if (vpd_buf->data[i] == 0x89) scsi_update_vpd_page(sdev, 0x89, &sdev->vpd_pg89); if (vpd_buf->data[i] == 0xb0) scsi_update_vpd_page(sdev, 0xb0, &sdev->vpd_pgb0); if (vpd_buf->data[i] == 0xb1) scsi_update_vpd_page(sdev, 0xb1, &sdev->vpd_pgb1); if (vpd_buf->data[i] == 0xb2) scsi_update_vpd_page(sdev, 0xb2, &sdev->vpd_pgb2); if (vpd_buf->data[i] == 0xb7) scsi_update_vpd_page(sdev, 0xb7, &sdev->vpd_pgb7); } kfree(vpd_buf); } /** * scsi_report_opcode - Find out if a given command is supported * @sdev: scsi device to query * @buffer: scratch buffer (must be at least 20 bytes long) * @len: length of buffer * @opcode: opcode for the command to look up * @sa: service action for the command to look up * * Uses the REPORT SUPPORTED OPERATION CODES to check support for the * command identified with @opcode and @sa. If the command does not * have a service action, @sa must be 0. Returns -EINVAL if RSOC fails, * 0 if the command is not supported and 1 if the device claims to * support the command. */ int scsi_report_opcode(struct scsi_device *sdev, unsigned char *buffer, unsigned int len, unsigned char opcode, unsigned short sa) { unsigned char cmd[16]; struct scsi_sense_hdr sshdr; int result, request_len; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, }; if (sdev->no_report_opcodes || sdev->scsi_level < SCSI_SPC_3) return -EINVAL; /* RSOC header + size of command we are asking about */ request_len = 4 + COMMAND_SIZE(opcode); if (request_len > len) { dev_warn_once(&sdev->sdev_gendev, "%s: len %u bytes, opcode 0x%02x needs %u\n", __func__, len, opcode, request_len); return -EINVAL; } memset(cmd, 0, 16); cmd[0] = MAINTENANCE_IN; cmd[1] = MI_REPORT_SUPPORTED_OPERATION_CODES; if (!sa) { cmd[2] = 1; /* One command format */ cmd[3] = opcode; } else { cmd[2] = 3; /* One command format with service action */ cmd[3] = opcode; put_unaligned_be16(sa, &cmd[4]); } put_unaligned_be32(request_len, &cmd[6]); memset(buffer, 0, len); result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, buffer, request_len, 30 * HZ, 3, &exec_args); if (result < 0) return result; if (result && scsi_sense_valid(&sshdr) && sshdr.sense_key == ILLEGAL_REQUEST && (sshdr.asc == 0x20 || sshdr.asc == 0x24) && sshdr.ascq == 0x00) return -EINVAL; if ((buffer[1] & 3) == 3) /* Command supported */ return 1; return 0; } EXPORT_SYMBOL(scsi_report_opcode); #define SCSI_CDL_CHECK_BUF_LEN 64 static bool scsi_cdl_check_cmd(struct scsi_device *sdev, u8 opcode, u16 sa, unsigned char *buf) { int ret; u8 cdlp; /* Check operation code */ ret = scsi_report_opcode(sdev, buf, SCSI_CDL_CHECK_BUF_LEN, opcode, sa); if (ret <= 0) return false; if ((buf[1] & 0x03) != 0x03) return false; /* * See SPC-6, One_command parameter data format for * REPORT SUPPORTED OPERATION CODES. We have the following cases * depending on rwcdlp (buf[0] & 0x01) value: * - rwcdlp == 0: then cdlp indicates support for the A mode page when * it is equal to 1 and for the B mode page when it is * equal to 2. * - rwcdlp == 1: then cdlp indicates support for the T2A mode page * when it is equal to 1 and for the T2B mode page when * it is equal to 2. * Overall, to detect support for command duration limits, we only need * to check that cdlp is 1 or 2. */ cdlp = (buf[1] & 0x18) >> 3; return cdlp == 0x01 || cdlp == 0x02; } /** * scsi_cdl_check - Check if a SCSI device supports Command Duration Limits * @sdev: The device to check */ void scsi_cdl_check(struct scsi_device *sdev) { bool cdl_supported; unsigned char *buf; /* * Support for CDL was defined in SPC-5. Ignore devices reporting an * lower SPC version. This also avoids problems with old drives choking * on MAINTENANCE_IN / MI_REPORT_SUPPORTED_OPERATION_CODES with a * service action specified, as done in scsi_cdl_check_cmd(). */ if (sdev->scsi_level < SCSI_SPC_5) { sdev->cdl_supported = 0; return; } buf = kmalloc(SCSI_CDL_CHECK_BUF_LEN, GFP_KERNEL); if (!buf) { sdev->cdl_supported = 0; return; } /* Check support for READ_16, WRITE_16, READ_32 and WRITE_32 commands */ cdl_supported = scsi_cdl_check_cmd(sdev, READ_16, 0, buf) || scsi_cdl_check_cmd(sdev, WRITE_16, 0, buf) || scsi_cdl_check_cmd(sdev, VARIABLE_LENGTH_CMD, READ_32, buf) || scsi_cdl_check_cmd(sdev, VARIABLE_LENGTH_CMD, WRITE_32, buf); if (cdl_supported) { /* * We have CDL support: force the use of READ16/WRITE16. * READ32 and WRITE32 will be used for devices that support * the T10_PI_TYPE2_PROTECTION protection type. */ sdev->use_16_for_rw = 1; sdev->use_10_for_rw = 0; sdev->cdl_supported = 1; /* * If the device supports CDL, make sure that the current drive * feature status is consistent with the user controlled * cdl_enable state. */ scsi_cdl_enable(sdev, sdev->cdl_enable); } else { sdev->cdl_supported = 0; } kfree(buf); } /** * scsi_cdl_enable - Enable or disable a SCSI device supports for Command * Duration Limits * @sdev: The target device * @enable: the target state */ int scsi_cdl_enable(struct scsi_device *sdev, bool enable) { struct scsi_mode_data data; struct scsi_sense_hdr sshdr; struct scsi_vpd *vpd; bool is_ata = false; char buf[64]; int ret; if (!sdev->cdl_supported) return -EOPNOTSUPP; rcu_read_lock(); vpd = rcu_dereference(sdev->vpd_pg89); if (vpd) is_ata = true; rcu_read_unlock(); /* * For ATA devices, CDL needs to be enabled with a SET FEATURES command. */ if (is_ata) { char *buf_data; int len; ret = scsi_mode_sense(sdev, 0x08, 0x0a, 0xf2, buf, sizeof(buf), 5 * HZ, 3, &data, NULL); if (ret) return -EINVAL; /* Enable CDL using the ATA feature page */ len = min_t(size_t, sizeof(buf), data.length - data.header_length - data.block_descriptor_length); buf_data = buf + data.header_length + data.block_descriptor_length; if (enable) buf_data[4] = 0x02; else buf_data[4] = 0; ret = scsi_mode_select(sdev, 1, 0, buf_data, len, 5 * HZ, 3, &data, &sshdr); if (ret) { if (ret > 0 && scsi_sense_valid(&sshdr)) scsi_print_sense_hdr(sdev, dev_name(&sdev->sdev_gendev), &sshdr); return ret; } } sdev->cdl_enable = enable; return 0; } /** * scsi_device_get - get an additional reference to a scsi_device * @sdev: device to get a reference to * * Description: Gets a reference to the scsi_device and increments the use count * of the underlying LLDD module. You must hold host_lock of the * parent Scsi_Host or already have a reference when calling this. * * This will fail if a device is deleted or cancelled, or when the LLD module * is in the process of being unloaded. */ int scsi_device_get(struct scsi_device *sdev) { if (sdev->sdev_state == SDEV_DEL || sdev->sdev_state == SDEV_CANCEL) goto fail; if (!try_module_get(sdev->host->hostt->module)) goto fail; if (!get_device(&sdev->sdev_gendev)) goto fail_put_module; return 0; fail_put_module: module_put(sdev->host->hostt->module); fail: return -ENXIO; } EXPORT_SYMBOL(scsi_device_get); /** * scsi_device_put - release a reference to a scsi_device * @sdev: device to release a reference on. * * Description: Release a reference to the scsi_device and decrements the use * count of the underlying LLDD module. The device is freed once the last * user vanishes. */ void scsi_device_put(struct scsi_device *sdev) { struct module *mod = sdev->host->hostt->module; put_device(&sdev->sdev_gendev); module_put(mod); } EXPORT_SYMBOL(scsi_device_put); /* helper for shost_for_each_device, see that for documentation */ struct scsi_device *__scsi_iterate_devices(struct Scsi_Host *shost, struct scsi_device *prev) { struct list_head *list = (prev ? &prev->siblings : &shost->__devices); struct scsi_device *next = NULL; unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); while (list->next != &shost->__devices) { next = list_entry(list->next, struct scsi_device, siblings); /* skip devices that we can't get a reference to */ if (!scsi_device_get(next)) break; next = NULL; list = list->next; } spin_unlock_irqrestore(shost->host_lock, flags); if (prev) scsi_device_put(prev); return next; } EXPORT_SYMBOL(__scsi_iterate_devices); /** * starget_for_each_device - helper to walk all devices of a target * @starget: target whose devices we want to iterate over. * @data: Opaque passed to each function call. * @fn: Function to call on each device * * This traverses over each device of @starget. The devices have * a reference that must be released by scsi_host_put when breaking * out of the loop. */ void starget_for_each_device(struct scsi_target *starget, void *data, void (*fn)(struct scsi_device *, void *)) { struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); struct scsi_device *sdev; shost_for_each_device(sdev, shost) { if ((sdev->channel == starget->channel) && (sdev->id == starget->id)) fn(sdev, data); } } EXPORT_SYMBOL(starget_for_each_device); /** * __starget_for_each_device - helper to walk all devices of a target (UNLOCKED) * @starget: target whose devices we want to iterate over. * @data: parameter for callback @fn() * @fn: callback function that is invoked for each device * * This traverses over each device of @starget. It does _not_ * take a reference on the scsi_device, so the whole loop must be * protected by shost->host_lock. * * Note: The only reason why drivers would want to use this is because * they need to access the device list in irq context. Otherwise you * really want to use starget_for_each_device instead. **/ void __starget_for_each_device(struct scsi_target *starget, void *data, void (*fn)(struct scsi_device *, void *)) { struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); struct scsi_device *sdev; __shost_for_each_device(sdev, shost) { if ((sdev->channel == starget->channel) && (sdev->id == starget->id)) fn(sdev, data); } } EXPORT_SYMBOL(__starget_for_each_device); /** * __scsi_device_lookup_by_target - find a device given the target (UNLOCKED) * @starget: SCSI target pointer * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @lun for a given * @starget. The returned scsi_device does not have an additional * reference. You must hold the host's host_lock over this call and * any access to the returned scsi_device. A scsi_device in state * SDEV_DEL is skipped. * * Note: The only reason why drivers should use this is because * they need to access the device list in irq context. Otherwise you * really want to use scsi_device_lookup_by_target instead. **/ struct scsi_device *__scsi_device_lookup_by_target(struct scsi_target *starget, u64 lun) { struct scsi_device *sdev; list_for_each_entry(sdev, &starget->devices, same_target_siblings) { if (sdev->sdev_state == SDEV_DEL) continue; if (sdev->lun ==lun) return sdev; } return NULL; } EXPORT_SYMBOL(__scsi_device_lookup_by_target); /** * scsi_device_lookup_by_target - find a device given the target * @starget: SCSI target pointer * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @lun for a given * @starget. The returned scsi_device has an additional reference that * needs to be released with scsi_device_put once you're done with it. **/ struct scsi_device *scsi_device_lookup_by_target(struct scsi_target *starget, u64 lun) { struct scsi_device *sdev; struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); sdev = __scsi_device_lookup_by_target(starget, lun); if (sdev && scsi_device_get(sdev)) sdev = NULL; spin_unlock_irqrestore(shost->host_lock, flags); return sdev; } EXPORT_SYMBOL(scsi_device_lookup_by_target); /** * __scsi_device_lookup - find a device given the host (UNLOCKED) * @shost: SCSI host pointer * @channel: SCSI channel (zero if only one channel) * @id: SCSI target number (physical unit number) * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @channel, @id, @lun * for a given host. The returned scsi_device does not have an additional * reference. You must hold the host's host_lock over this call and any access * to the returned scsi_device. * * Note: The only reason why drivers would want to use this is because * they need to access the device list in irq context. Otherwise you * really want to use scsi_device_lookup instead. **/ struct scsi_device *__scsi_device_lookup(struct Scsi_Host *shost, uint channel, uint id, u64 lun) { struct scsi_device *sdev; list_for_each_entry(sdev, &shost->__devices, siblings) { if (sdev->sdev_state == SDEV_DEL) continue; if (sdev->channel == channel && sdev->id == id && sdev->lun ==lun) return sdev; } return NULL; } EXPORT_SYMBOL(__scsi_device_lookup); /** * scsi_device_lookup - find a device given the host * @shost: SCSI host pointer * @channel: SCSI channel (zero if only one channel) * @id: SCSI target number (physical unit number) * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @channel, @id, @lun * for a given host. The returned scsi_device has an additional reference that * needs to be released with scsi_device_put once you're done with it. **/ struct scsi_device *scsi_device_lookup(struct Scsi_Host *shost, uint channel, uint id, u64 lun) { struct scsi_device *sdev; unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); sdev = __scsi_device_lookup(shost, channel, id, lun); if (sdev && scsi_device_get(sdev)) sdev = NULL; spin_unlock_irqrestore(shost->host_lock, flags); return sdev; } EXPORT_SYMBOL(scsi_device_lookup); MODULE_DESCRIPTION("SCSI core"); MODULE_LICENSE("GPL"); module_param(scsi_logging_level, int, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(scsi_logging_level, "a bit mask of logging levels"); static int __init init_scsi(void) { int error; error = scsi_init_procfs(); if (error) goto cleanup_queue; error = scsi_init_devinfo(); if (error) goto cleanup_procfs; error = scsi_init_hosts(); if (error) goto cleanup_devlist; error = scsi_init_sysctl(); if (error) goto cleanup_hosts; error = scsi_sysfs_register(); if (error) goto cleanup_sysctl; scsi_netlink_init(); printk(KERN_NOTICE "SCSI subsystem initialized\n"); return 0; cleanup_sysctl: scsi_exit_sysctl(); cleanup_hosts: scsi_exit_hosts(); cleanup_devlist: scsi_exit_devinfo(); cleanup_procfs: scsi_exit_procfs(); cleanup_queue: scsi_exit_queue(); printk(KERN_ERR "SCSI subsystem failed to initialize, error = %d\n", -error); return error; } static void __exit exit_scsi(void) { scsi_netlink_exit(); scsi_sysfs_unregister(); scsi_exit_sysctl(); scsi_exit_hosts(); scsi_exit_devinfo(); scsi_exit_procfs(); scsi_exit_queue(); } subsys_initcall(init_scsi); module_exit(exit_scsi); |
4 2 2 1 1 5 10 5 4 1 9 10 2 9 9 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Tom Herbert <tom@herbertland.com> */ #include <linux/skbuff.h> #include <linux/skbuff_ref.h> #include <linux/workqueue.h> #include <net/strparser.h> #include <net/tcp.h> #include <net/sock.h> #include <net/tls.h> #include "tls.h" static struct workqueue_struct *tls_strp_wq; static void tls_strp_abort_strp(struct tls_strparser *strp, int err) { if (strp->stopped) return; strp->stopped = 1; /* Report an error on the lower socket */ WRITE_ONCE(strp->sk->sk_err, -err); /* Paired with smp_rmb() in tcp_poll() */ smp_wmb(); sk_error_report(strp->sk); } static void tls_strp_anchor_free(struct tls_strparser *strp) { struct skb_shared_info *shinfo = skb_shinfo(strp->anchor); DEBUG_NET_WARN_ON_ONCE(atomic_read(&shinfo->dataref) != 1); if (!strp->copy_mode) shinfo->frag_list = NULL; consume_skb(strp->anchor); strp->anchor = NULL; } static struct sk_buff * tls_strp_skb_copy(struct tls_strparser *strp, struct sk_buff *in_skb, int offset, int len) { struct sk_buff *skb; int i, err; skb = alloc_skb_with_frags(0, len, TLS_PAGE_ORDER, &err, strp->sk->sk_allocation); if (!skb) return NULL; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; WARN_ON_ONCE(skb_copy_bits(in_skb, offset, skb_frag_address(frag), skb_frag_size(frag))); offset += skb_frag_size(frag); } skb->len = len; skb->data_len = len; skb_copy_header(skb, in_skb); return skb; } /* Create a new skb with the contents of input copied to its page frags */ static struct sk_buff *tls_strp_msg_make_copy(struct tls_strparser *strp) { struct strp_msg *rxm; struct sk_buff *skb; skb = tls_strp_skb_copy(strp, strp->anchor, strp->stm.offset, strp->stm.full_len); if (!skb) return NULL; rxm = strp_msg(skb); rxm->offset = 0; return skb; } /* Steal the input skb, input msg is invalid after calling this function */ struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx) { struct tls_strparser *strp = &ctx->strp; #ifdef CONFIG_TLS_DEVICE DEBUG_NET_WARN_ON_ONCE(!strp->anchor->decrypted); #else /* This function turns an input into an output, * that can only happen if we have offload. */ WARN_ON(1); #endif if (strp->copy_mode) { struct sk_buff *skb; /* Replace anchor with an empty skb, this is a little * dangerous but __tls_cur_msg() warns on empty skbs * so hopefully we'll catch abuses. */ skb = alloc_skb(0, strp->sk->sk_allocation); if (!skb) return NULL; swap(strp->anchor, skb); return skb; } return tls_strp_msg_make_copy(strp); } /* Force the input skb to be in copy mode. The data ownership remains * with the input skb itself (meaning unpause will wipe it) but it can * be modified. */ int tls_strp_msg_cow(struct tls_sw_context_rx *ctx) { struct tls_strparser *strp = &ctx->strp; struct sk_buff *skb; if (strp->copy_mode) return 0; skb = tls_strp_msg_make_copy(strp); if (!skb) return -ENOMEM; tls_strp_anchor_free(strp); strp->anchor = skb; tcp_read_done(strp->sk, strp->stm.full_len); strp->copy_mode = 1; return 0; } /* Make a clone (in the skb sense) of the input msg to keep a reference * to the underlying data. The reference-holding skbs get placed on * @dst. */ int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst) { struct skb_shared_info *shinfo = skb_shinfo(strp->anchor); if (strp->copy_mode) { struct sk_buff *skb; WARN_ON_ONCE(!shinfo->nr_frags); /* We can't skb_clone() the anchor, it gets wiped by unpause */ skb = alloc_skb(0, strp->sk->sk_allocation); if (!skb) return -ENOMEM; __skb_queue_tail(dst, strp->anchor); strp->anchor = skb; } else { struct sk_buff *iter, *clone; int chunk, len, offset; offset = strp->stm.offset; len = strp->stm.full_len; iter = shinfo->frag_list; while (len > 0) { if (iter->len <= offset) { offset -= iter->len; goto next; } chunk = iter->len - offset; offset = 0; clone = skb_clone(iter, strp->sk->sk_allocation); if (!clone) return -ENOMEM; __skb_queue_tail(dst, clone); len -= chunk; next: iter = iter->next; } } return 0; } static void tls_strp_flush_anchor_copy(struct tls_strparser *strp) { struct skb_shared_info *shinfo = skb_shinfo(strp->anchor); int i; DEBUG_NET_WARN_ON_ONCE(atomic_read(&shinfo->dataref) != 1); for (i = 0; i < shinfo->nr_frags; i++) __skb_frag_unref(&shinfo->frags[i], false); shinfo->nr_frags = 0; if (strp->copy_mode) { kfree_skb_list(shinfo->frag_list); shinfo->frag_list = NULL; } strp->copy_mode = 0; strp->mixed_decrypted = 0; } static int tls_strp_copyin_frag(struct tls_strparser *strp, struct sk_buff *skb, struct sk_buff *in_skb, unsigned int offset, size_t in_len) { size_t len, chunk; skb_frag_t *frag; int sz; frag = &skb_shinfo(skb)->frags[skb->len / PAGE_SIZE]; len = in_len; /* First make sure we got the header */ if (!strp->stm.full_len) { /* Assume one page is more than enough for headers */ chunk = min_t(size_t, len, PAGE_SIZE - skb_frag_size(frag)); WARN_ON_ONCE(skb_copy_bits(in_skb, offset, skb_frag_address(frag) + skb_frag_size(frag), chunk)); skb->len += chunk; skb->data_len += chunk; skb_frag_size_add(frag, chunk); sz = tls_rx_msg_size(strp, skb); if (sz < 0) return sz; /* We may have over-read, sz == 0 is guaranteed under-read */ if (unlikely(sz && sz < skb->len)) { int over = skb->len - sz; WARN_ON_ONCE(over > chunk); skb->len -= over; skb->data_len -= over; skb_frag_size_add(frag, -over); chunk -= over; } frag++; len -= chunk; offset += chunk; strp->stm.full_len = sz; if (!strp->stm.full_len) goto read_done; } /* Load up more data */ while (len && strp->stm.full_len > skb->len) { chunk = min_t(size_t, len, strp->stm.full_len - skb->len); chunk = min_t(size_t, chunk, PAGE_SIZE - skb_frag_size(frag)); WARN_ON_ONCE(skb_copy_bits(in_skb, offset, skb_frag_address(frag) + skb_frag_size(frag), chunk)); skb->len += chunk; skb->data_len += chunk; skb_frag_size_add(frag, chunk); frag++; len -= chunk; offset += chunk; } read_done: return in_len - len; } static int tls_strp_copyin_skb(struct tls_strparser *strp, struct sk_buff *skb, struct sk_buff *in_skb, unsigned int offset, size_t in_len) { struct sk_buff *nskb, *first, *last; struct skb_shared_info *shinfo; size_t chunk; int sz; if (strp->stm.full_len) chunk = strp->stm.full_len - skb->len; else chunk = TLS_MAX_PAYLOAD_SIZE + PAGE_SIZE; chunk = min(chunk, in_len); nskb = tls_strp_skb_copy(strp, in_skb, offset, chunk); if (!nskb) return -ENOMEM; shinfo = skb_shinfo(skb); if (!shinfo->frag_list) { shinfo->frag_list = nskb; nskb->prev = nskb; } else { first = shinfo->frag_list; last = first->prev; last->next = nskb; first->prev = nskb; } skb->len += chunk; skb->data_len += chunk; if (!strp->stm.full_len) { sz = tls_rx_msg_size(strp, skb); if (sz < 0) return sz; /* We may have over-read, sz == 0 is guaranteed under-read */ if (unlikely(sz && sz < skb->len)) { int over = skb->len - sz; WARN_ON_ONCE(over > chunk); skb->len -= over; skb->data_len -= over; __pskb_trim(nskb, nskb->len - over); chunk -= over; } strp->stm.full_len = sz; } return chunk; } static int tls_strp_copyin(read_descriptor_t *desc, struct sk_buff *in_skb, unsigned int offset, size_t in_len) { struct tls_strparser *strp = (struct tls_strparser *)desc->arg.data; struct sk_buff *skb; int ret; if (strp->msg_ready) return 0; skb = strp->anchor; if (!skb->len) skb_copy_decrypted(skb, in_skb); else strp->mixed_decrypted |= !!skb_cmp_decrypted(skb, in_skb); if (IS_ENABLED(CONFIG_TLS_DEVICE) && strp->mixed_decrypted) ret = tls_strp_copyin_skb(strp, skb, in_skb, offset, in_len); else ret = tls_strp_copyin_frag(strp, skb, in_skb, offset, in_len); if (ret < 0) { desc->error = ret; ret = 0; } if (strp->stm.full_len && strp->stm.full_len == skb->len) { desc->count = 0; WRITE_ONCE(strp->msg_ready, 1); tls_rx_msg_ready(strp); } return ret; } static int tls_strp_read_copyin(struct tls_strparser *strp) { read_descriptor_t desc; desc.arg.data = strp; desc.error = 0; desc.count = 1; /* give more than one skb per call */ /* sk should be locked here, so okay to do read_sock */ tcp_read_sock(strp->sk, &desc, tls_strp_copyin); return desc.error; } static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort) { struct skb_shared_info *shinfo; struct page *page; int need_spc, len; /* If the rbuf is small or rcv window has collapsed to 0 we need * to read the data out. Otherwise the connection will stall. * Without pressure threshold of INT_MAX will never be ready. */ if (likely(qshort && !tcp_epollin_ready(strp->sk, INT_MAX))) return 0; shinfo = skb_shinfo(strp->anchor); shinfo->frag_list = NULL; /* If we don't know the length go max plus page for cipher overhead */ need_spc = strp->stm.full_len ?: TLS_MAX_PAYLOAD_SIZE + PAGE_SIZE; for (len = need_spc; len > 0; len -= PAGE_SIZE) { page = alloc_page(strp->sk->sk_allocation); if (!page) { tls_strp_flush_anchor_copy(strp); return -ENOMEM; } skb_fill_page_desc(strp->anchor, shinfo->nr_frags++, page, 0, 0); } strp->copy_mode = 1; strp->stm.offset = 0; strp->anchor->len = 0; strp->anchor->data_len = 0; strp->anchor->truesize = round_up(need_spc, PAGE_SIZE); tls_strp_read_copyin(strp); return 0; } static bool tls_strp_check_queue_ok(struct tls_strparser *strp) { unsigned int len = strp->stm.offset + strp->stm.full_len; struct sk_buff *first, *skb; u32 seq; first = skb_shinfo(strp->anchor)->frag_list; skb = first; seq = TCP_SKB_CB(first)->seq; /* Make sure there's no duplicate data in the queue, * and the decrypted status matches. */ while (skb->len < len) { seq += skb->len; len -= skb->len; skb = skb->next; if (TCP_SKB_CB(skb)->seq != seq) return false; if (skb_cmp_decrypted(first, skb)) return false; } return true; } static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len) { struct tcp_sock *tp = tcp_sk(strp->sk); struct sk_buff *first; u32 offset; first = tcp_recv_skb(strp->sk, tp->copied_seq, &offset); if (WARN_ON_ONCE(!first)) return; /* Bestow the state onto the anchor */ strp->anchor->len = offset + len; strp->anchor->data_len = offset + len; strp->anchor->truesize = offset + len; skb_shinfo(strp->anchor)->frag_list = first; skb_copy_header(strp->anchor, first); strp->anchor->destructor = NULL; strp->stm.offset = offset; } void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) { struct strp_msg *rxm; struct tls_msg *tlm; DEBUG_NET_WARN_ON_ONCE(!strp->msg_ready); DEBUG_NET_WARN_ON_ONCE(!strp->stm.full_len); if (!strp->copy_mode && force_refresh) { if (WARN_ON(tcp_inq(strp->sk) < strp->stm.full_len)) return; tls_strp_load_anchor_with_queue(strp, strp->stm.full_len); } rxm = strp_msg(strp->anchor); rxm->full_len = strp->stm.full_len; rxm->offset = strp->stm.offset; tlm = tls_msg(strp->anchor); tlm->control = strp->mark; } /* Called with lock held on lower socket */ static int tls_strp_read_sock(struct tls_strparser *strp) { int sz, inq; inq = tcp_inq(strp->sk); if (inq < 1) return 0; if (unlikely(strp->copy_mode)) return tls_strp_read_copyin(strp); if (inq < strp->stm.full_len) return tls_strp_read_copy(strp, true); if (!strp->stm.full_len) { tls_strp_load_anchor_with_queue(strp, inq); sz = tls_rx_msg_size(strp, strp->anchor); if (sz < 0) { tls_strp_abort_strp(strp, sz); return sz; } strp->stm.full_len = sz; if (!strp->stm.full_len || inq < strp->stm.full_len) return tls_strp_read_copy(strp, true); } if (!tls_strp_check_queue_ok(strp)) return tls_strp_read_copy(strp, false); WRITE_ONCE(strp->msg_ready, 1); tls_rx_msg_ready(strp); return 0; } void tls_strp_check_rcv(struct tls_strparser *strp) { if (unlikely(strp->stopped) || strp->msg_ready) return; if (tls_strp_read_sock(strp) == -ENOMEM) queue_work(tls_strp_wq, &strp->work); } /* Lower sock lock held */ void tls_strp_data_ready(struct tls_strparser *strp) { /* This check is needed to synchronize with do_tls_strp_work. * do_tls_strp_work acquires a process lock (lock_sock) whereas * the lock held here is bh_lock_sock. The two locks can be * held by different threads at the same time, but bh_lock_sock * allows a thread in BH context to safely check if the process * lock is held. In this case, if the lock is held, queue work. */ if (sock_owned_by_user_nocheck(strp->sk)) { queue_work(tls_strp_wq, &strp->work); return; } tls_strp_check_rcv(strp); } static void tls_strp_work(struct work_struct *w) { struct tls_strparser *strp = container_of(w, struct tls_strparser, work); lock_sock(strp->sk); tls_strp_check_rcv(strp); release_sock(strp->sk); } void tls_strp_msg_done(struct tls_strparser *strp) { WARN_ON(!strp->stm.full_len); if (likely(!strp->copy_mode)) tcp_read_done(strp->sk, strp->stm.full_len); else tls_strp_flush_anchor_copy(strp); WRITE_ONCE(strp->msg_ready, 0); memset(&strp->stm, 0, sizeof(strp->stm)); tls_strp_check_rcv(strp); } void tls_strp_stop(struct tls_strparser *strp) { strp->stopped = 1; } int tls_strp_init(struct tls_strparser *strp, struct sock *sk) { memset(strp, 0, sizeof(*strp)); strp->sk = sk; strp->anchor = alloc_skb(0, GFP_KERNEL); if (!strp->anchor) return -ENOMEM; INIT_WORK(&strp->work, tls_strp_work); return 0; } /* strp must already be stopped so that tls_strp_recv will no longer be called. * Note that tls_strp_done is not called with the lower socket held. */ void tls_strp_done(struct tls_strparser *strp) { WARN_ON(!strp->stopped); cancel_work_sync(&strp->work); tls_strp_anchor_free(strp); } int __init tls_strp_dev_init(void) { tls_strp_wq = create_workqueue("tls-strp"); if (unlikely(!tls_strp_wq)) return -ENOMEM; return 0; } void tls_strp_dev_exit(void) { destroy_workqueue(tls_strp_wq); } |
1 2 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 | /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2021 Facebook */ #ifndef __MMAP_UNLOCK_WORK_H__ #define __MMAP_UNLOCK_WORK_H__ #include <linux/irq_work.h> /* irq_work to run mmap_read_unlock() in irq_work */ struct mmap_unlock_irq_work { struct irq_work irq_work; struct mm_struct *mm; }; DECLARE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work); /* * We cannot do mmap_read_unlock() when the irq is disabled, because of * risk to deadlock with rq_lock. To look up vma when the irqs are * disabled, we need to run mmap_read_unlock() in irq_work. We use a * percpu variable to do the irq_work. If the irq_work is already used * by another lookup, we fall over. */ static inline bool bpf_mmap_unlock_get_irq_work(struct mmap_unlock_irq_work **work_ptr) { struct mmap_unlock_irq_work *work = NULL; bool irq_work_busy = false; if (irqs_disabled()) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { work = this_cpu_ptr(&mmap_unlock_work); if (irq_work_is_busy(&work->irq_work)) { /* cannot queue more up_read, fallback */ irq_work_busy = true; } } else { /* * PREEMPT_RT does not allow to trylock mmap sem in * interrupt disabled context. Force the fallback code. */ irq_work_busy = true; } } *work_ptr = work; return irq_work_busy; } static inline void bpf_mmap_unlock_mm(struct mmap_unlock_irq_work *work, struct mm_struct *mm) { if (!work) { mmap_read_unlock(mm); } else { work->mm = mm; /* The lock will be released once we're out of interrupt * context. Tell lockdep that we've released it now so * it doesn't complain that we forgot to release it. */ rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_); irq_work_queue(&work->irq_work); } } #endif /* __MMAP_UNLOCK_WORK_H__ */ |
2 2 7 8 1 1 1 1 1 1 2 1 2 3 3 4 4 4 1 1 4 1 1 1 1 10 10 1 1 2 2 1 1 1 1 1 1 9 10 10 9 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 | // SPDX-License-Identifier: GPL-2.0-only /* * File: pn_netlink.c * * Phonet netlink interface * * Copyright (C) 2008 Nokia Corporation. * * Authors: Sakari Ailus <sakari.ailus@nokia.com> * Remi Denis-Courmont */ #include <linux/kernel.h> #include <linux/netlink.h> #include <linux/phonet.h> #include <linux/slab.h> #include <net/sock.h> #include <net/phonet/pn_dev.h> /* Device address handling */ static int fill_addr(struct sk_buff *skb, struct net_device *dev, u8 addr, u32 portid, u32 seq, int event); void phonet_address_notify(int event, struct net_device *dev, u8 addr) { struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(1), GFP_KERNEL); if (skb == NULL) goto errout; err = fill_addr(skb, dev, addr, 0, 0, event); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, dev_net(dev), 0, RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_IFADDR, err); } static const struct nla_policy ifa_phonet_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U8 }, }; static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[IFA_MAX+1]; struct net_device *dev; struct ifaddrmsg *ifm; int err; u8 pnaddr; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; ASSERT_RTNL(); err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_phonet_policy, extack); if (err < 0) return err; ifm = nlmsg_data(nlh); if (tb[IFA_LOCAL] == NULL) return -EINVAL; pnaddr = nla_get_u8(tb[IFA_LOCAL]); if (pnaddr & 3) /* Phonet addresses only have 6 high-order bits */ return -EINVAL; dev = __dev_get_by_index(net, ifm->ifa_index); if (dev == NULL) return -ENODEV; if (nlh->nlmsg_type == RTM_NEWADDR) err = phonet_address_add(dev, pnaddr); else err = phonet_address_del(dev, pnaddr); if (!err) phonet_address_notify(nlh->nlmsg_type, dev, pnaddr); return err; } static int fill_addr(struct sk_buff *skb, struct net_device *dev, u8 addr, u32 portid, u32 seq, int event) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), 0); if (nlh == NULL) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifa_family = AF_PHONET; ifm->ifa_prefixlen = 0; ifm->ifa_flags = IFA_F_PERMANENT; ifm->ifa_scope = RT_SCOPE_LINK; ifm->ifa_index = dev->ifindex; if (nla_put_u8(skb, IFA_LOCAL, addr)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int getaddr_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct phonet_device_list *pndevs; struct phonet_device *pnd; int dev_idx = 0, dev_start_idx = cb->args[0]; int addr_idx = 0, addr_start_idx = cb->args[1]; pndevs = phonet_device_list(sock_net(skb->sk)); rcu_read_lock(); list_for_each_entry_rcu(pnd, &pndevs->list, list) { u8 addr; if (dev_idx > dev_start_idx) addr_start_idx = 0; if (dev_idx++ < dev_start_idx) continue; addr_idx = 0; for_each_set_bit(addr, pnd->addrs, 64) { if (addr_idx++ < addr_start_idx) continue; if (fill_addr(skb, pnd->netdev, addr << 2, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWADDR) < 0) goto out; } } out: rcu_read_unlock(); cb->args[0] = dev_idx; cb->args[1] = addr_idx; return skb->len; } /* Routes handling */ static int fill_route(struct sk_buff *skb, struct net_device *dev, u8 dst, u32 portid, u32 seq, int event) { struct rtmsg *rtm; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), 0); if (nlh == NULL) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = AF_PHONET; rtm->rtm_dst_len = 6; rtm->rtm_src_len = 0; rtm->rtm_tos = 0; rtm->rtm_table = RT_TABLE_MAIN; rtm->rtm_protocol = RTPROT_STATIC; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; if (nla_put_u8(skb, RTA_DST, dst) || nla_put_u32(skb, RTA_OIF, READ_ONCE(dev->ifindex))) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } void rtm_phonet_notify(int event, struct net_device *dev, u8 dst) { struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(1) + nla_total_size(4), GFP_KERNEL); if (skb == NULL) goto errout; err = fill_route(skb, dev, dst, 0, 0, event); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, dev_net(dev), 0, RTNLGRP_PHONET_ROUTE, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_ROUTE, err); } static const struct nla_policy rtm_phonet_policy[RTA_MAX+1] = { [RTA_DST] = { .type = NLA_U8 }, [RTA_OIF] = { .type = NLA_U32 }, }; static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[RTA_MAX+1]; struct net_device *dev; struct rtmsg *rtm; int err; u8 dst; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; ASSERT_RTNL(); err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_phonet_policy, extack); if (err < 0) return err; rtm = nlmsg_data(nlh); if (rtm->rtm_table != RT_TABLE_MAIN || rtm->rtm_type != RTN_UNICAST) return -EINVAL; if (tb[RTA_DST] == NULL || tb[RTA_OIF] == NULL) return -EINVAL; dst = nla_get_u8(tb[RTA_DST]); if (dst & 3) /* Phonet addresses only have 6 high-order bits */ return -EINVAL; dev = __dev_get_by_index(net, nla_get_u32(tb[RTA_OIF])); if (dev == NULL) return -ENODEV; if (nlh->nlmsg_type == RTM_NEWROUTE) err = phonet_route_add(dev, dst); else err = phonet_route_del(dev, dst); if (!err) rtm_phonet_notify(nlh->nlmsg_type, dev, dst); return err; } static int route_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); int err = 0; u8 addr; rcu_read_lock(); for (addr = cb->args[0]; addr < 64; addr++) { struct net_device *dev = phonet_route_get_rcu(net, addr << 2); if (!dev) continue; err = fill_route(skb, dev, addr << 2, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWROUTE); if (err < 0) break; } rcu_read_unlock(); cb->args[0] = addr; return err; } static const struct rtnl_msg_handler phonet_rtnl_msg_handlers[] __initdata_or_module = { {THIS_MODULE, PF_PHONET, RTM_NEWADDR, addr_doit, NULL, 0}, {THIS_MODULE, PF_PHONET, RTM_DELADDR, addr_doit, NULL, 0}, {THIS_MODULE, PF_PHONET, RTM_GETADDR, NULL, getaddr_dumpit, 0}, {THIS_MODULE, PF_PHONET, RTM_NEWROUTE, route_doit, NULL, 0}, {THIS_MODULE, PF_PHONET, RTM_DELROUTE, route_doit, NULL, 0}, {THIS_MODULE, PF_PHONET, RTM_GETROUTE, NULL, route_dumpit, RTNL_FLAG_DUMP_UNLOCKED}, }; int __init phonet_netlink_register(void) { return rtnl_register_many(phonet_rtnl_msg_handlers); } |
3 3 3 3 3 3 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 | // SPDX-License-Identifier: GPL-2.0-only /* * spectrum management * * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi> * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2008, Intel Corporation * Copyright 2008, Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2018, 2020, 2022-2024 Intel Corporation */ #include <linux/ieee80211.h> #include <net/cfg80211.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "sta_info.h" #include "wme.h" static bool wbcs_elem_to_chandef(const struct ieee80211_wide_bw_chansw_ie *wbcs_elem, struct cfg80211_chan_def *chandef) { u8 ccfs0 = wbcs_elem->new_center_freq_seg0; u8 ccfs1 = wbcs_elem->new_center_freq_seg1; u32 cf0 = ieee80211_channel_to_frequency(ccfs0, chandef->chan->band); u32 cf1 = ieee80211_channel_to_frequency(ccfs1, chandef->chan->band); switch (wbcs_elem->new_channel_width) { case IEEE80211_VHT_CHANWIDTH_160MHZ: /* deprecated encoding */ chandef->width = NL80211_CHAN_WIDTH_160; chandef->center_freq1 = cf0; break; case IEEE80211_VHT_CHANWIDTH_80P80MHZ: /* deprecated encoding */ chandef->width = NL80211_CHAN_WIDTH_80P80; chandef->center_freq1 = cf0; chandef->center_freq2 = cf1; break; case IEEE80211_VHT_CHANWIDTH_80MHZ: chandef->width = NL80211_CHAN_WIDTH_80; chandef->center_freq1 = cf0; if (ccfs1) { u8 diff = abs(ccfs0 - ccfs1); if (diff == 8) { chandef->width = NL80211_CHAN_WIDTH_160; chandef->center_freq1 = cf1; } else if (diff > 8) { chandef->width = NL80211_CHAN_WIDTH_80P80; chandef->center_freq2 = cf1; } } break; case IEEE80211_VHT_CHANWIDTH_USE_HT: default: /* If the WBCS Element is present, new channel bandwidth is * at least 40 MHz. */ chandef->width = NL80211_CHAN_WIDTH_40; chandef->center_freq1 = cf0; break; } return cfg80211_chandef_valid(chandef); } static void validate_chandef_by_ht_vht_oper(struct ieee80211_sub_if_data *sdata, struct ieee80211_conn_settings *conn, u32 vht_cap_info, struct cfg80211_chan_def *chandef) { u32 control_freq, center_freq1, center_freq2; enum nl80211_chan_width chan_width; struct ieee80211_ht_operation ht_oper; struct ieee80211_vht_operation vht_oper; if (conn->mode < IEEE80211_CONN_MODE_HT || conn->bw_limit < IEEE80211_CONN_BW_LIMIT_40) { chandef->chan = NULL; return; } control_freq = chandef->chan->center_freq; center_freq1 = chandef->center_freq1; center_freq2 = chandef->center_freq2; chan_width = chandef->width; ht_oper.primary_chan = ieee80211_frequency_to_channel(control_freq); if (control_freq != center_freq1) ht_oper.ht_param = control_freq > center_freq1 ? IEEE80211_HT_PARAM_CHA_SEC_BELOW : IEEE80211_HT_PARAM_CHA_SEC_ABOVE; else ht_oper.ht_param = IEEE80211_HT_PARAM_CHA_SEC_NONE; ieee80211_chandef_ht_oper(&ht_oper, chandef); if (conn->mode < IEEE80211_CONN_MODE_VHT) return; vht_oper.center_freq_seg0_idx = ieee80211_frequency_to_channel(center_freq1); vht_oper.center_freq_seg1_idx = center_freq2 ? ieee80211_frequency_to_channel(center_freq2) : 0; switch (chan_width) { case NL80211_CHAN_WIDTH_320: WARN_ON(1); break; case NL80211_CHAN_WIDTH_160: vht_oper.chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; vht_oper.center_freq_seg1_idx = vht_oper.center_freq_seg0_idx; vht_oper.center_freq_seg0_idx += control_freq < center_freq1 ? -8 : 8; break; case NL80211_CHAN_WIDTH_80P80: vht_oper.chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_80: vht_oper.chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; break; default: vht_oper.chan_width = IEEE80211_VHT_CHANWIDTH_USE_HT; break; } ht_oper.operation_mode = le16_encode_bits(vht_oper.center_freq_seg1_idx, IEEE80211_HT_OP_MODE_CCFS2_MASK); if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, &vht_oper, &ht_oper, chandef)) chandef->chan = NULL; } static void validate_chandef_by_6ghz_he_eht_oper(struct ieee80211_sub_if_data *sdata, struct ieee80211_conn_settings *conn, struct cfg80211_chan_def *chandef) { struct ieee80211_local *local = sdata->local; u32 control_freq, center_freq1, center_freq2; enum nl80211_chan_width chan_width; struct { struct ieee80211_he_operation _oper; struct ieee80211_he_6ghz_oper _6ghz_oper; } __packed he; struct { struct ieee80211_eht_operation _oper; struct ieee80211_eht_operation_info _oper_info; } __packed eht; const struct ieee80211_eht_operation *eht_oper; if (conn->mode < IEEE80211_CONN_MODE_HE) { chandef->chan = NULL; return; } control_freq = chandef->chan->center_freq; center_freq1 = chandef->center_freq1; center_freq2 = chandef->center_freq2; chan_width = chandef->width; he._oper.he_oper_params = le32_encode_bits(1, IEEE80211_HE_OPERATION_6GHZ_OP_INFO); he._6ghz_oper.primary = ieee80211_frequency_to_channel(control_freq); he._6ghz_oper.ccfs0 = ieee80211_frequency_to_channel(center_freq1); he._6ghz_oper.ccfs1 = center_freq2 ? ieee80211_frequency_to_channel(center_freq2) : 0; switch (chan_width) { case NL80211_CHAN_WIDTH_320: he._6ghz_oper.ccfs1 = he._6ghz_oper.ccfs0; he._6ghz_oper.ccfs0 += control_freq < center_freq1 ? -16 : 16; he._6ghz_oper.control = IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ; break; case NL80211_CHAN_WIDTH_160: he._6ghz_oper.ccfs1 = he._6ghz_oper.ccfs0; he._6ghz_oper.ccfs0 += control_freq < center_freq1 ? -8 : 8; fallthrough; case NL80211_CHAN_WIDTH_80P80: he._6ghz_oper.control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; break; case NL80211_CHAN_WIDTH_80: he._6ghz_oper.control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_40: he._6ghz_oper.control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ; break; default: he._6ghz_oper.control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ; break; } if (conn->mode < IEEE80211_CONN_MODE_EHT) { eht_oper = NULL; } else { eht._oper.params = IEEE80211_EHT_OPER_INFO_PRESENT; eht._oper_info.control = he._6ghz_oper.control; eht._oper_info.ccfs0 = he._6ghz_oper.ccfs0; eht._oper_info.ccfs1 = he._6ghz_oper.ccfs1; eht_oper = &eht._oper; } if (!ieee80211_chandef_he_6ghz_oper(local, &he._oper, eht_oper, chandef)) chandef->chan = NULL; } int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band current_band, u32 vht_cap_info, struct ieee80211_conn_settings *conn, u8 *bssid, bool unprot_action, struct ieee80211_csa_ie *csa_ie) { enum nl80211_band new_band = current_band; int new_freq; u8 new_chan_no = 0, new_op_class = 0; struct ieee80211_channel *new_chan; struct cfg80211_chan_def new_chandef = {}; const struct ieee80211_sec_chan_offs_ie *sec_chan_offs; const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie; const struct ieee80211_bandwidth_indication *bwi; const struct ieee80211_ext_chansw_ie *ext_chansw_elem; int secondary_channel_offset = -1; memset(csa_ie, 0, sizeof(*csa_ie)); sec_chan_offs = elems->sec_chan_offs; wide_bw_chansw_ie = elems->wide_bw_chansw_ie; bwi = elems->bandwidth_indication; ext_chansw_elem = elems->ext_chansw_ie; if (conn->mode < IEEE80211_CONN_MODE_HT || conn->bw_limit < IEEE80211_CONN_BW_LIMIT_40) { sec_chan_offs = NULL; wide_bw_chansw_ie = NULL; } if (conn->mode < IEEE80211_CONN_MODE_VHT) wide_bw_chansw_ie = NULL; if (ext_chansw_elem) { new_op_class = ext_chansw_elem->new_operating_class; if (!ieee80211_operating_class_to_band(new_op_class, &new_band)) { new_op_class = 0; if (!unprot_action) sdata_info(sdata, "cannot understand ECSA IE operating class, %d, ignoring\n", ext_chansw_elem->new_operating_class); } else { new_chan_no = ext_chansw_elem->new_ch_num; csa_ie->count = ext_chansw_elem->count; csa_ie->mode = ext_chansw_elem->mode; } } if (!new_op_class && elems->ch_switch_ie) { new_chan_no = elems->ch_switch_ie->new_ch_num; csa_ie->count = elems->ch_switch_ie->count; csa_ie->mode = elems->ch_switch_ie->mode; } /* nothing here we understand */ if (!new_chan_no) return 1; /* Mesh Channel Switch Parameters Element */ if (elems->mesh_chansw_params_ie) { csa_ie->ttl = elems->mesh_chansw_params_ie->mesh_ttl; csa_ie->mode = elems->mesh_chansw_params_ie->mesh_flags; csa_ie->pre_value = le16_to_cpu( elems->mesh_chansw_params_ie->mesh_pre_value); if (elems->mesh_chansw_params_ie->mesh_flags & WLAN_EID_CHAN_SWITCH_PARAM_REASON) csa_ie->reason_code = le16_to_cpu( elems->mesh_chansw_params_ie->mesh_reason); } new_freq = ieee80211_channel_to_frequency(new_chan_no, new_band); new_chan = ieee80211_get_channel(sdata->local->hw.wiphy, new_freq); if (!new_chan || new_chan->flags & IEEE80211_CHAN_DISABLED) { if (!unprot_action) sdata_info(sdata, "BSS %pM switches to unsupported channel (%d MHz), disconnecting\n", bssid, new_freq); return -EINVAL; } if (sec_chan_offs) { secondary_channel_offset = sec_chan_offs->sec_chan_offs; } else if (conn->mode >= IEEE80211_CONN_MODE_HT) { /* If the secondary channel offset IE is not present, * we can't know what's the post-CSA offset, so the * best we can do is use 20MHz. */ secondary_channel_offset = IEEE80211_HT_PARAM_CHA_SEC_NONE; } switch (secondary_channel_offset) { default: /* secondary_channel_offset was present but is invalid */ case IEEE80211_HT_PARAM_CHA_SEC_NONE: cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan, NL80211_CHAN_HT20); break; case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan, NL80211_CHAN_HT40PLUS); break; case IEEE80211_HT_PARAM_CHA_SEC_BELOW: cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan, NL80211_CHAN_HT40MINUS); break; case -1: cfg80211_chandef_create(&csa_ie->chanreq.oper, new_chan, NL80211_CHAN_NO_HT); /* keep width for 5/10 MHz channels */ switch (sdata->vif.bss_conf.chanreq.oper.width) { case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: csa_ie->chanreq.oper.width = sdata->vif.bss_conf.chanreq.oper.width; break; default: break; } break; } /* capture the AP configuration */ csa_ie->chanreq.ap = csa_ie->chanreq.oper; /* parse one of the Elements to build a new chandef */ memset(&new_chandef, 0, sizeof(new_chandef)); new_chandef.chan = new_chan; if (bwi) { /* start with the CSA one */ new_chandef = csa_ie->chanreq.oper; /* and update the width accordingly */ ieee80211_chandef_eht_oper(&bwi->info, &new_chandef); if (bwi->params & IEEE80211_BW_IND_DIS_SUBCH_PRESENT) new_chandef.punctured = get_unaligned_le16(bwi->info.optional); } else if (!wide_bw_chansw_ie || !wbcs_elem_to_chandef(wide_bw_chansw_ie, &new_chandef)) { if (!ieee80211_operating_class_to_chandef(new_op_class, new_chan, &new_chandef)) new_chandef = csa_ie->chanreq.oper; } /* check if the new chandef fits the capabilities */ if (new_band == NL80211_BAND_6GHZ) validate_chandef_by_6ghz_he_eht_oper(sdata, conn, &new_chandef); else validate_chandef_by_ht_vht_oper(sdata, conn, vht_cap_info, &new_chandef); /* if data is there validate the bandwidth & use it */ if (new_chandef.chan) { /* capture the AP chandef before (potential) downgrading */ csa_ie->chanreq.ap = new_chandef; if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_320 && new_chandef.width == NL80211_CHAN_WIDTH_320) ieee80211_chandef_downgrade(&new_chandef, NULL); if (conn->bw_limit < IEEE80211_CONN_BW_LIMIT_160 && (new_chandef.width == NL80211_CHAN_WIDTH_80P80 || new_chandef.width == NL80211_CHAN_WIDTH_160)) ieee80211_chandef_downgrade(&new_chandef, NULL); if (!cfg80211_chandef_compatible(&new_chandef, &csa_ie->chanreq.oper)) { sdata_info(sdata, "BSS %pM: CSA has inconsistent channel data, disconnecting\n", bssid); return -EINVAL; } csa_ie->chanreq.oper = new_chandef; } if (elems->max_channel_switch_time) csa_ie->max_switch_time = (elems->max_channel_switch_time[0] << 0) | (elems->max_channel_switch_time[1] << 8) | (elems->max_channel_switch_time[2] << 16); return 0; } static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_data *sdata, struct ieee80211_msrment_ie *request_ie, const u8 *da, const u8 *bssid, u8 dialog_token) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *msr_report; skb = dev_alloc_skb(sizeof(*msr_report) + local->hw.extra_tx_headroom + sizeof(struct ieee80211_msrment_ie)); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); msr_report = skb_put_zero(skb, 24); memcpy(msr_report->da, da, ETH_ALEN); memcpy(msr_report->sa, sdata->vif.addr, ETH_ALEN); memcpy(msr_report->bssid, bssid, ETH_ALEN); msr_report->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); skb_put(skb, 1 + sizeof(msr_report->u.action.u.measurement)); msr_report->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT; msr_report->u.action.u.measurement.action_code = WLAN_ACTION_SPCT_MSR_RPRT; msr_report->u.action.u.measurement.dialog_token = dialog_token; msr_report->u.action.u.measurement.element_id = WLAN_EID_MEASURE_REPORT; msr_report->u.action.u.measurement.length = sizeof(struct ieee80211_msrment_ie); memset(&msr_report->u.action.u.measurement.msr_elem, 0, sizeof(struct ieee80211_msrment_ie)); msr_report->u.action.u.measurement.msr_elem.token = request_ie->token; msr_report->u.action.u.measurement.msr_elem.mode |= IEEE80211_SPCT_MSR_RPRT_MODE_REFUSED; msr_report->u.action.u.measurement.msr_elem.type = request_ie->type; ieee80211_tx_skb(sdata, skb); } void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { /* * Ignoring measurement request is spec violation. * Mandatory measurements must be reported optional * measurements might be refused or reported incapable * For now just refuse * TODO: Answer basic measurement as unmeasured */ ieee80211_send_refuse_measurement_request(sdata, &mgmt->u.action.u.measurement.msr_elem, mgmt->sa, mgmt->bssid, mgmt->u.action.u.measurement.dialog_token); } |
4 4 5 3 1 1 3 3 16 16 16 16 1 1 6 1 4 4 4 4 4 4 4 4 4 3 17 2 2 4 4 5 5 1 1 4 1 1 4 4 4 8 2 1 4 2 5 11 1 10 9 2 9 9 1 2 2 8 16 16 16 15 29 30 30 30 30 16 16 16 16 16 16 16 16 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 | // SPDX-License-Identifier: GPL-2.0-only /* * IEEE 802.1Q Multiple Registration Protocol (MRP) * * Copyright (c) 2012 Massachusetts Institute of Technology * * Adapted from code in net/802/garp.c * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> */ #include <linux/kernel.h> #include <linux/timer.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/module.h> #include <net/mrp.h> #include <linux/unaligned.h> static unsigned int mrp_join_time __read_mostly = 200; module_param(mrp_join_time, uint, 0644); MODULE_PARM_DESC(mrp_join_time, "Join time in ms (default 200ms)"); static unsigned int mrp_periodic_time __read_mostly = 1000; module_param(mrp_periodic_time, uint, 0644); MODULE_PARM_DESC(mrp_periodic_time, "Periodic time in ms (default 1s)"); MODULE_DESCRIPTION("IEEE 802.1Q Multiple Registration Protocol (MRP)"); MODULE_LICENSE("GPL"); static const u8 mrp_applicant_state_table[MRP_APPLICANT_MAX + 1][MRP_EVENT_MAX + 1] = { [MRP_APPLICANT_VO] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_VP, [MRP_EVENT_LV] = MRP_APPLICANT_VO, [MRP_EVENT_TX] = MRP_APPLICANT_VO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_VO, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AO, [MRP_EVENT_R_IN] = MRP_APPLICANT_VO, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VO, [MRP_EVENT_R_MT] = MRP_APPLICANT_VO, [MRP_EVENT_R_LV] = MRP_APPLICANT_VO, [MRP_EVENT_R_LA] = MRP_APPLICANT_VO, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VO, }, [MRP_APPLICANT_VP] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_VP, [MRP_EVENT_LV] = MRP_APPLICANT_VO, [MRP_EVENT_TX] = MRP_APPLICANT_AA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_VP, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AP, [MRP_EVENT_R_IN] = MRP_APPLICANT_VP, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VP, [MRP_EVENT_R_MT] = MRP_APPLICANT_VP, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VP, }, [MRP_APPLICANT_VN] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_VN, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_AN, [MRP_EVENT_R_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_VN, [MRP_EVENT_R_IN] = MRP_APPLICANT_VN, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VN, [MRP_EVENT_R_MT] = MRP_APPLICANT_VN, [MRP_EVENT_R_LV] = MRP_APPLICANT_VN, [MRP_EVENT_R_LA] = MRP_APPLICANT_VN, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VN, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VN, }, [MRP_APPLICANT_AN] = { [MRP_EVENT_NEW] = MRP_APPLICANT_AN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AN, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AN, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AN, [MRP_EVENT_R_IN] = MRP_APPLICANT_AN, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AN, [MRP_EVENT_R_MT] = MRP_APPLICANT_AN, [MRP_EVENT_R_LV] = MRP_APPLICANT_VN, [MRP_EVENT_R_LA] = MRP_APPLICANT_VN, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VN, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AN, }, [MRP_APPLICANT_AA] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AA, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AA, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QA, [MRP_EVENT_R_IN] = MRP_APPLICANT_AA, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AA, }, [MRP_APPLICANT_QA] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_QA, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_QA, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QA, [MRP_EVENT_R_IN] = MRP_APPLICANT_QA, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AA, }, [MRP_APPLICANT_LA] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AA, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_VO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_LA, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_LA, [MRP_EVENT_R_IN] = MRP_APPLICANT_LA, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_LA, [MRP_EVENT_R_MT] = MRP_APPLICANT_LA, [MRP_EVENT_R_LV] = MRP_APPLICANT_LA, [MRP_EVENT_R_LA] = MRP_APPLICANT_LA, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_LA, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_LA, }, [MRP_APPLICANT_AO] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AP, [MRP_EVENT_LV] = MRP_APPLICANT_AO, [MRP_EVENT_TX] = MRP_APPLICANT_AO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AO, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QO, [MRP_EVENT_R_IN] = MRP_APPLICANT_AO, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_LV] = MRP_APPLICANT_VO, [MRP_EVENT_R_LA] = MRP_APPLICANT_VO, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AO, }, [MRP_APPLICANT_QO] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_QP, [MRP_EVENT_LV] = MRP_APPLICANT_QO, [MRP_EVENT_TX] = MRP_APPLICANT_QO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_QO, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QO, [MRP_EVENT_R_IN] = MRP_APPLICANT_QO, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_LV] = MRP_APPLICANT_VO, [MRP_EVENT_R_LA] = MRP_APPLICANT_VO, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_QO, }, [MRP_APPLICANT_AP] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AP, [MRP_EVENT_LV] = MRP_APPLICANT_AO, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AP, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QP, [MRP_EVENT_R_IN] = MRP_APPLICANT_AP, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AP, }, [MRP_APPLICANT_QP] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_QP, [MRP_EVENT_LV] = MRP_APPLICANT_QO, [MRP_EVENT_TX] = MRP_APPLICANT_QP, [MRP_EVENT_R_NEW] = MRP_APPLICANT_QP, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QP, [MRP_EVENT_R_IN] = MRP_APPLICANT_QP, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AP, }, }; static const u8 mrp_tx_action_table[MRP_APPLICANT_MAX + 1] = { [MRP_APPLICANT_VO] = MRP_TX_ACTION_S_IN_OPTIONAL, [MRP_APPLICANT_VP] = MRP_TX_ACTION_S_JOIN_IN, [MRP_APPLICANT_VN] = MRP_TX_ACTION_S_NEW, [MRP_APPLICANT_AN] = MRP_TX_ACTION_S_NEW, [MRP_APPLICANT_AA] = MRP_TX_ACTION_S_JOIN_IN, [MRP_APPLICANT_QA] = MRP_TX_ACTION_S_JOIN_IN_OPTIONAL, [MRP_APPLICANT_LA] = MRP_TX_ACTION_S_LV, [MRP_APPLICANT_AO] = MRP_TX_ACTION_S_IN_OPTIONAL, [MRP_APPLICANT_QO] = MRP_TX_ACTION_S_IN_OPTIONAL, [MRP_APPLICANT_AP] = MRP_TX_ACTION_S_JOIN_IN, [MRP_APPLICANT_QP] = MRP_TX_ACTION_S_IN_OPTIONAL, }; static void mrp_attrvalue_inc(void *value, u8 len) { u8 *v = (u8 *)value; /* Add 1 to the last byte. If it becomes zero, * go to the previous byte and repeat. */ while (len > 0 && !++v[--len]) ; } static int mrp_attr_cmp(const struct mrp_attr *attr, const void *value, u8 len, u8 type) { if (attr->type != type) return attr->type - type; if (attr->len != len) return attr->len - len; return memcmp(attr->value, value, len); } static struct mrp_attr *mrp_attr_lookup(const struct mrp_applicant *app, const void *value, u8 len, u8 type) { struct rb_node *parent = app->mad.rb_node; struct mrp_attr *attr; int d; while (parent) { attr = rb_entry(parent, struct mrp_attr, node); d = mrp_attr_cmp(attr, value, len, type); if (d > 0) parent = parent->rb_left; else if (d < 0) parent = parent->rb_right; else return attr; } return NULL; } static struct mrp_attr *mrp_attr_create(struct mrp_applicant *app, const void *value, u8 len, u8 type) { struct rb_node *parent = NULL, **p = &app->mad.rb_node; struct mrp_attr *attr; int d; while (*p) { parent = *p; attr = rb_entry(parent, struct mrp_attr, node); d = mrp_attr_cmp(attr, value, len, type); if (d > 0) p = &parent->rb_left; else if (d < 0) p = &parent->rb_right; else { /* The attribute already exists; re-use it. */ return attr; } } attr = kmalloc(sizeof(*attr) + len, GFP_ATOMIC); if (!attr) return attr; attr->state = MRP_APPLICANT_VO; attr->type = type; attr->len = len; memcpy(attr->value, value, len); rb_link_node(&attr->node, parent, p); rb_insert_color(&attr->node, &app->mad); return attr; } static void mrp_attr_destroy(struct mrp_applicant *app, struct mrp_attr *attr) { rb_erase(&attr->node, &app->mad); kfree(attr); } static void mrp_attr_destroy_all(struct mrp_applicant *app) { struct rb_node *node, *next; struct mrp_attr *attr; for (node = rb_first(&app->mad); next = node ? rb_next(node) : NULL, node != NULL; node = next) { attr = rb_entry(node, struct mrp_attr, node); mrp_attr_destroy(app, attr); } } static int mrp_pdu_init(struct mrp_applicant *app) { struct sk_buff *skb; struct mrp_pdu_hdr *ph; skb = alloc_skb(app->dev->mtu + LL_RESERVED_SPACE(app->dev), GFP_ATOMIC); if (!skb) return -ENOMEM; skb->dev = app->dev; skb->protocol = app->app->pkttype.type; skb_reserve(skb, LL_RESERVED_SPACE(app->dev)); skb_reset_network_header(skb); skb_reset_transport_header(skb); ph = __skb_put(skb, sizeof(*ph)); ph->version = app->app->version; app->pdu = skb; return 0; } static int mrp_pdu_append_end_mark(struct mrp_applicant *app) { __be16 *endmark; if (skb_tailroom(app->pdu) < sizeof(*endmark)) return -1; endmark = __skb_put(app->pdu, sizeof(*endmark)); put_unaligned(MRP_END_MARK, endmark); return 0; } static void mrp_pdu_queue(struct mrp_applicant *app) { if (!app->pdu) return; if (mrp_cb(app->pdu)->mh) mrp_pdu_append_end_mark(app); mrp_pdu_append_end_mark(app); dev_hard_header(app->pdu, app->dev, ntohs(app->app->pkttype.type), app->app->group_address, app->dev->dev_addr, app->pdu->len); skb_queue_tail(&app->queue, app->pdu); app->pdu = NULL; } static void mrp_queue_xmit(struct mrp_applicant *app) { struct sk_buff *skb; while ((skb = skb_dequeue(&app->queue))) dev_queue_xmit(skb); } static int mrp_pdu_append_msg_hdr(struct mrp_applicant *app, u8 attrtype, u8 attrlen) { struct mrp_msg_hdr *mh; if (mrp_cb(app->pdu)->mh) { if (mrp_pdu_append_end_mark(app) < 0) return -1; mrp_cb(app->pdu)->mh = NULL; mrp_cb(app->pdu)->vah = NULL; } if (skb_tailroom(app->pdu) < sizeof(*mh)) return -1; mh = __skb_put(app->pdu, sizeof(*mh)); mh->attrtype = attrtype; mh->attrlen = attrlen; mrp_cb(app->pdu)->mh = mh; return 0; } static int mrp_pdu_append_vecattr_hdr(struct mrp_applicant *app, const void *firstattrvalue, u8 attrlen) { struct mrp_vecattr_hdr *vah; if (skb_tailroom(app->pdu) < sizeof(*vah) + attrlen) return -1; vah = __skb_put(app->pdu, sizeof(*vah) + attrlen); put_unaligned(0, &vah->lenflags); memcpy(vah->firstattrvalue, firstattrvalue, attrlen); mrp_cb(app->pdu)->vah = vah; memcpy(mrp_cb(app->pdu)->attrvalue, firstattrvalue, attrlen); return 0; } static int mrp_pdu_append_vecattr_event(struct mrp_applicant *app, const struct mrp_attr *attr, enum mrp_vecattr_event vaevent) { u16 len, pos; u8 *vaevents; int err; again: if (!app->pdu) { err = mrp_pdu_init(app); if (err < 0) return err; } /* If there is no Message header in the PDU, or the Message header is * for a different attribute type, add an EndMark (if necessary) and a * new Message header to the PDU. */ if (!mrp_cb(app->pdu)->mh || mrp_cb(app->pdu)->mh->attrtype != attr->type || mrp_cb(app->pdu)->mh->attrlen != attr->len) { if (mrp_pdu_append_msg_hdr(app, attr->type, attr->len) < 0) goto queue; } /* If there is no VectorAttribute header for this Message in the PDU, * or this attribute's value does not sequentially follow the previous * attribute's value, add a new VectorAttribute header to the PDU. */ if (!mrp_cb(app->pdu)->vah || memcmp(mrp_cb(app->pdu)->attrvalue, attr->value, attr->len)) { if (mrp_pdu_append_vecattr_hdr(app, attr->value, attr->len) < 0) goto queue; } len = be16_to_cpu(get_unaligned(&mrp_cb(app->pdu)->vah->lenflags)); pos = len % 3; /* Events are packed into Vectors in the PDU, three to a byte. Add a * byte to the end of the Vector if necessary. */ if (!pos) { if (skb_tailroom(app->pdu) < sizeof(u8)) goto queue; vaevents = __skb_put(app->pdu, sizeof(u8)); } else { vaevents = (u8 *)(skb_tail_pointer(app->pdu) - sizeof(u8)); } switch (pos) { case 0: *vaevents = vaevent * (__MRP_VECATTR_EVENT_MAX * __MRP_VECATTR_EVENT_MAX); break; case 1: *vaevents += vaevent * __MRP_VECATTR_EVENT_MAX; break; case 2: *vaevents += vaevent; break; default: WARN_ON(1); } /* Increment the length of the VectorAttribute in the PDU, as well as * the value of the next attribute that would continue its Vector. */ put_unaligned(cpu_to_be16(++len), &mrp_cb(app->pdu)->vah->lenflags); mrp_attrvalue_inc(mrp_cb(app->pdu)->attrvalue, attr->len); return 0; queue: mrp_pdu_queue(app); goto again; } static void mrp_attr_event(struct mrp_applicant *app, struct mrp_attr *attr, enum mrp_event event) { enum mrp_applicant_state state; state = mrp_applicant_state_table[attr->state][event]; if (state == MRP_APPLICANT_INVALID) { WARN_ON(1); return; } if (event == MRP_EVENT_TX) { /* When appending the attribute fails, don't update its state * in order to retry at the next TX event. */ switch (mrp_tx_action_table[attr->state]) { case MRP_TX_ACTION_NONE: case MRP_TX_ACTION_S_JOIN_IN_OPTIONAL: case MRP_TX_ACTION_S_IN_OPTIONAL: break; case MRP_TX_ACTION_S_NEW: if (mrp_pdu_append_vecattr_event( app, attr, MRP_VECATTR_EVENT_NEW) < 0) return; break; case MRP_TX_ACTION_S_JOIN_IN: if (mrp_pdu_append_vecattr_event( app, attr, MRP_VECATTR_EVENT_JOIN_IN) < 0) return; break; case MRP_TX_ACTION_S_LV: if (mrp_pdu_append_vecattr_event( app, attr, MRP_VECATTR_EVENT_LV) < 0) return; /* As a pure applicant, sending a leave message * implies that the attribute was unregistered and * can be destroyed. */ mrp_attr_destroy(app, attr); return; default: WARN_ON(1); } } attr->state = state; } int mrp_request_join(const struct net_device *dev, const struct mrp_application *appl, const void *value, u8 len, u8 type) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); struct mrp_applicant *app = rtnl_dereference( port->applicants[appl->type]); struct mrp_attr *attr; if (sizeof(struct mrp_skb_cb) + len > sizeof_field(struct sk_buff, cb)) return -ENOMEM; spin_lock_bh(&app->lock); attr = mrp_attr_create(app, value, len, type); if (!attr) { spin_unlock_bh(&app->lock); return -ENOMEM; } mrp_attr_event(app, attr, MRP_EVENT_JOIN); spin_unlock_bh(&app->lock); return 0; } EXPORT_SYMBOL_GPL(mrp_request_join); void mrp_request_leave(const struct net_device *dev, const struct mrp_application *appl, const void *value, u8 len, u8 type) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); struct mrp_applicant *app = rtnl_dereference( port->applicants[appl->type]); struct mrp_attr *attr; if (sizeof(struct mrp_skb_cb) + len > sizeof_field(struct sk_buff, cb)) return; spin_lock_bh(&app->lock); attr = mrp_attr_lookup(app, value, len, type); if (!attr) { spin_unlock_bh(&app->lock); return; } mrp_attr_event(app, attr, MRP_EVENT_LV); spin_unlock_bh(&app->lock); } EXPORT_SYMBOL_GPL(mrp_request_leave); static void mrp_mad_event(struct mrp_applicant *app, enum mrp_event event) { struct rb_node *node, *next; struct mrp_attr *attr; for (node = rb_first(&app->mad); next = node ? rb_next(node) : NULL, node != NULL; node = next) { attr = rb_entry(node, struct mrp_attr, node); mrp_attr_event(app, attr, event); } } static void mrp_join_timer_arm(struct mrp_applicant *app) { unsigned long delay; delay = get_random_u32_below(msecs_to_jiffies(mrp_join_time)); mod_timer(&app->join_timer, jiffies + delay); } static void mrp_join_timer(struct timer_list *t) { struct mrp_applicant *app = from_timer(app, t, join_timer); spin_lock(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); mrp_pdu_queue(app); spin_unlock(&app->lock); mrp_queue_xmit(app); spin_lock(&app->lock); if (likely(app->active)) mrp_join_timer_arm(app); spin_unlock(&app->lock); } static void mrp_periodic_timer_arm(struct mrp_applicant *app) { mod_timer(&app->periodic_timer, jiffies + msecs_to_jiffies(mrp_periodic_time)); } static void mrp_periodic_timer(struct timer_list *t) { struct mrp_applicant *app = from_timer(app, t, periodic_timer); spin_lock(&app->lock); if (likely(app->active)) { mrp_mad_event(app, MRP_EVENT_PERIODIC); mrp_pdu_queue(app); mrp_periodic_timer_arm(app); } spin_unlock(&app->lock); } static int mrp_pdu_parse_end_mark(struct sk_buff *skb, int *offset) { __be16 endmark; if (skb_copy_bits(skb, *offset, &endmark, sizeof(endmark)) < 0) return -1; if (endmark == MRP_END_MARK) { *offset += sizeof(endmark); return -1; } return 0; } static void mrp_pdu_parse_vecattr_event(struct mrp_applicant *app, struct sk_buff *skb, enum mrp_vecattr_event vaevent) { struct mrp_attr *attr; enum mrp_event event; attr = mrp_attr_lookup(app, mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen, mrp_cb(skb)->mh->attrtype); if (attr == NULL) return; switch (vaevent) { case MRP_VECATTR_EVENT_NEW: event = MRP_EVENT_R_NEW; break; case MRP_VECATTR_EVENT_JOIN_IN: event = MRP_EVENT_R_JOIN_IN; break; case MRP_VECATTR_EVENT_IN: event = MRP_EVENT_R_IN; break; case MRP_VECATTR_EVENT_JOIN_MT: event = MRP_EVENT_R_JOIN_MT; break; case MRP_VECATTR_EVENT_MT: event = MRP_EVENT_R_MT; break; case MRP_VECATTR_EVENT_LV: event = MRP_EVENT_R_LV; break; default: return; } mrp_attr_event(app, attr, event); } static int mrp_pdu_parse_vecattr(struct mrp_applicant *app, struct sk_buff *skb, int *offset) { struct mrp_vecattr_hdr _vah; u16 valen; u8 vaevents, vaevent; mrp_cb(skb)->vah = skb_header_pointer(skb, *offset, sizeof(_vah), &_vah); if (!mrp_cb(skb)->vah) return -1; *offset += sizeof(_vah); if (get_unaligned(&mrp_cb(skb)->vah->lenflags) & MRP_VECATTR_HDR_FLAG_LA) mrp_mad_event(app, MRP_EVENT_R_LA); valen = be16_to_cpu(get_unaligned(&mrp_cb(skb)->vah->lenflags) & MRP_VECATTR_HDR_LEN_MASK); /* The VectorAttribute structure in a PDU carries event information * about one or more attributes having consecutive values. Only the * value for the first attribute is contained in the structure. So * we make a copy of that value, and then increment it each time we * advance to the next event in its Vector. */ if (sizeof(struct mrp_skb_cb) + mrp_cb(skb)->mh->attrlen > sizeof_field(struct sk_buff, cb)) return -1; if (skb_copy_bits(skb, *offset, mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen) < 0) return -1; *offset += mrp_cb(skb)->mh->attrlen; /* In a VectorAttribute, the Vector contains events which are packed * three to a byte. We process one byte of the Vector at a time. */ while (valen > 0) { if (skb_copy_bits(skb, *offset, &vaevents, sizeof(vaevents)) < 0) return -1; *offset += sizeof(vaevents); /* Extract and process the first event. */ vaevent = vaevents / (__MRP_VECATTR_EVENT_MAX * __MRP_VECATTR_EVENT_MAX); if (vaevent >= __MRP_VECATTR_EVENT_MAX) { /* The byte is malformed; stop processing. */ return -1; } mrp_pdu_parse_vecattr_event(app, skb, vaevent); /* If present, extract and process the second event. */ if (!--valen) break; mrp_attrvalue_inc(mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen); vaevents %= (__MRP_VECATTR_EVENT_MAX * __MRP_VECATTR_EVENT_MAX); vaevent = vaevents / __MRP_VECATTR_EVENT_MAX; mrp_pdu_parse_vecattr_event(app, skb, vaevent); /* If present, extract and process the third event. */ if (!--valen) break; mrp_attrvalue_inc(mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen); vaevents %= __MRP_VECATTR_EVENT_MAX; vaevent = vaevents; mrp_pdu_parse_vecattr_event(app, skb, vaevent); } return 0; } static int mrp_pdu_parse_msg(struct mrp_applicant *app, struct sk_buff *skb, int *offset) { struct mrp_msg_hdr _mh; mrp_cb(skb)->mh = skb_header_pointer(skb, *offset, sizeof(_mh), &_mh); if (!mrp_cb(skb)->mh) return -1; *offset += sizeof(_mh); if (mrp_cb(skb)->mh->attrtype == 0 || mrp_cb(skb)->mh->attrtype > app->app->maxattr || mrp_cb(skb)->mh->attrlen == 0) return -1; while (skb->len > *offset) { if (mrp_pdu_parse_end_mark(skb, offset) < 0) break; if (mrp_pdu_parse_vecattr(app, skb, offset) < 0) return -1; } return 0; } static int mrp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct mrp_application *appl = container_of(pt, struct mrp_application, pkttype); struct mrp_port *port; struct mrp_applicant *app; struct mrp_pdu_hdr _ph; const struct mrp_pdu_hdr *ph; int offset = skb_network_offset(skb); /* If the interface is in promiscuous mode, drop the packet if * it was unicast to another host. */ if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) goto out; skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) goto out; port = rcu_dereference(dev->mrp_port); if (unlikely(!port)) goto out; app = rcu_dereference(port->applicants[appl->type]); if (unlikely(!app)) goto out; ph = skb_header_pointer(skb, offset, sizeof(_ph), &_ph); if (!ph) goto out; offset += sizeof(_ph); if (ph->version != app->app->version) goto out; spin_lock(&app->lock); while (skb->len > offset) { if (mrp_pdu_parse_end_mark(skb, &offset) < 0) break; if (mrp_pdu_parse_msg(app, skb, &offset) < 0) break; } spin_unlock(&app->lock); out: kfree_skb(skb); return 0; } static int mrp_init_port(struct net_device *dev) { struct mrp_port *port; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return -ENOMEM; rcu_assign_pointer(dev->mrp_port, port); return 0; } static void mrp_release_port(struct net_device *dev) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); unsigned int i; for (i = 0; i <= MRP_APPLICATION_MAX; i++) { if (rtnl_dereference(port->applicants[i])) return; } RCU_INIT_POINTER(dev->mrp_port, NULL); kfree_rcu(port, rcu); } int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl) { struct mrp_applicant *app; int err; ASSERT_RTNL(); if (!rtnl_dereference(dev->mrp_port)) { err = mrp_init_port(dev); if (err < 0) goto err1; } err = -ENOMEM; app = kzalloc(sizeof(*app), GFP_KERNEL); if (!app) goto err2; err = dev_mc_add(dev, appl->group_address); if (err < 0) goto err3; app->dev = dev; app->app = appl; app->mad = RB_ROOT; app->active = true; spin_lock_init(&app->lock); skb_queue_head_init(&app->queue); rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app); timer_setup(&app->join_timer, mrp_join_timer, 0); mrp_join_timer_arm(app); timer_setup(&app->periodic_timer, mrp_periodic_timer, 0); mrp_periodic_timer_arm(app); return 0; err3: kfree(app); err2: mrp_release_port(dev); err1: return err; } EXPORT_SYMBOL_GPL(mrp_init_applicant); void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); struct mrp_applicant *app = rtnl_dereference( port->applicants[appl->type]); ASSERT_RTNL(); RCU_INIT_POINTER(port->applicants[appl->type], NULL); spin_lock_bh(&app->lock); app->active = false; spin_unlock_bh(&app->lock); /* Delete timer and generate a final TX event to flush out * all pending messages before the applicant is gone. */ timer_shutdown_sync(&app->join_timer); timer_shutdown_sync(&app->periodic_timer); spin_lock_bh(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); mrp_attr_destroy_all(app); mrp_pdu_queue(app); spin_unlock_bh(&app->lock); mrp_queue_xmit(app); dev_mc_del(dev, appl->group_address); kfree_rcu(app, rcu); mrp_release_port(dev); } EXPORT_SYMBOL_GPL(mrp_uninit_applicant); int mrp_register_application(struct mrp_application *appl) { appl->pkttype.func = mrp_rcv; dev_add_pack(&appl->pkttype); return 0; } EXPORT_SYMBOL_GPL(mrp_register_application); void mrp_unregister_application(struct mrp_application *appl) { dev_remove_pack(&appl->pkttype); } EXPORT_SYMBOL_GPL(mrp_unregister_application); |
148 129 103 150 66 66 148 150 149 90 81 128 35 146 148 125 34 103 35 150 4 29 128 129 130 129 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/static_key.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/nft_meta.h> #if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_X86) static struct static_key_false nf_tables_skip_direct_calls; static bool nf_skip_indirect_calls(void) { return static_branch_likely(&nf_tables_skip_direct_calls); } static void __init nf_skip_indirect_calls_enable(void) { if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE)) static_branch_enable(&nf_tables_skip_direct_calls); } #else static inline bool nf_skip_indirect_calls(void) { return false; } static inline void nf_skip_indirect_calls_enable(void) { } #endif static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt, const struct nft_verdict *verdict, const struct nft_rule_dp *rule, struct nft_traceinfo *info, enum nft_trace_types type) { if (!info->trace || !info->nf_trace) return; info->type = type; nft_trace_notify(pkt, verdict, rule, info); } static inline void nft_trace_packet(const struct nft_pktinfo *pkt, struct nft_verdict *verdict, struct nft_traceinfo *info, const struct nft_rule_dp *rule, enum nft_trace_types type) { if (static_branch_unlikely(&nft_trace_enabled)) { info->nf_trace = pkt->skb->nf_trace; __nft_trace_packet(pkt, verdict, rule, info, type); } } static inline void nft_trace_copy_nftrace(const struct nft_pktinfo *pkt, struct nft_traceinfo *info) { if (static_branch_unlikely(&nft_trace_enabled)) info->nf_trace = pkt->skb->nf_trace; } static void nft_bitwise_fast_eval(const struct nft_expr *expr, struct nft_regs *regs) { const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr); u32 *src = ®s->data[priv->sreg]; u32 *dst = ®s->data[priv->dreg]; *dst = (*src & priv->mask) ^ priv->xor; } static void nft_cmp_fast_eval(const struct nft_expr *expr, struct nft_regs *regs) { const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); if (((regs->data[priv->sreg] & priv->mask) == priv->data) ^ priv->inv) return; regs->verdict.code = NFT_BREAK; } static void nft_cmp16_fast_eval(const struct nft_expr *expr, struct nft_regs *regs) { const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr); const u64 *reg_data = (const u64 *)®s->data[priv->sreg]; const u64 *mask = (const u64 *)&priv->mask; const u64 *data = (const u64 *)&priv->data; if (((reg_data[0] & mask[0]) == data[0] && ((reg_data[1] & mask[1]) == data[1])) ^ priv->inv) return; regs->verdict.code = NFT_BREAK; } static noinline void __nft_trace_verdict(const struct nft_pktinfo *pkt, struct nft_traceinfo *info, const struct nft_rule_dp *rule, const struct nft_regs *regs) { enum nft_trace_types type; switch (regs->verdict.code & NF_VERDICT_MASK) { case NFT_CONTINUE: case NFT_RETURN: type = NFT_TRACETYPE_RETURN; break; case NF_STOLEN: type = NFT_TRACETYPE_RULE; /* can't access skb->nf_trace; use copy */ break; default: type = NFT_TRACETYPE_RULE; if (info->trace) info->nf_trace = pkt->skb->nf_trace; break; } __nft_trace_packet(pkt, ®s->verdict, rule, info, type); } static inline void nft_trace_verdict(const struct nft_pktinfo *pkt, struct nft_traceinfo *info, const struct nft_rule_dp *rule, const struct nft_regs *regs) { if (static_branch_unlikely(&nft_trace_enabled)) __nft_trace_verdict(pkt, info, rule, regs); } static bool nft_payload_fast_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_payload *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; u32 *dest = ®s->data[priv->dreg]; unsigned char *ptr; if (priv->base == NFT_PAYLOAD_NETWORK_HEADER) ptr = skb_network_header(skb); else { if (!(pkt->flags & NFT_PKTINFO_L4PROTO)) return false; ptr = skb->data + nft_thoff(pkt); } ptr += priv->offset; if (unlikely(ptr + priv->len > skb_tail_pointer(skb))) return false; *dest = 0; if (priv->len == 2) *(u16 *)dest = *(u16 *)ptr; else if (priv->len == 4) *(u32 *)dest = *(u32 *)ptr; else *(u8 *)dest = *(u8 *)ptr; return true; } DEFINE_STATIC_KEY_FALSE(nft_counters_enabled); static noinline void nft_update_chain_stats(const struct nft_chain *chain, const struct nft_pktinfo *pkt) { struct nft_base_chain *base_chain; struct nft_stats __percpu *pstats; struct nft_stats *stats; base_chain = nft_base_chain(chain); pstats = READ_ONCE(base_chain->stats); if (pstats) { local_bh_disable(); stats = this_cpu_ptr(pstats); u64_stats_update_begin(&stats->syncp); stats->pkts++; stats->bytes += pkt->skb->len; u64_stats_update_end(&stats->syncp); local_bh_enable(); } } struct nft_jumpstack { const struct nft_rule_dp *rule; }; static void expr_call_ops_eval(const struct nft_expr *expr, struct nft_regs *regs, struct nft_pktinfo *pkt) { #ifdef CONFIG_MITIGATION_RETPOLINE unsigned long e; if (nf_skip_indirect_calls()) goto indirect_call; e = (unsigned long)expr->ops->eval; #define X(e, fun) \ do { if ((e) == (unsigned long)(fun)) \ return fun(expr, regs, pkt); } while (0) X(e, nft_payload_eval); X(e, nft_cmp_eval); X(e, nft_counter_eval); X(e, nft_meta_get_eval); X(e, nft_lookup_eval); #if IS_ENABLED(CONFIG_NFT_CT) X(e, nft_ct_get_fast_eval); #endif X(e, nft_range_eval); X(e, nft_immediate_eval); X(e, nft_byteorder_eval); X(e, nft_dynset_eval); X(e, nft_rt_get_eval); X(e, nft_bitwise_eval); X(e, nft_objref_eval); X(e, nft_objref_map_eval); #undef X indirect_call: #endif /* CONFIG_MITIGATION_RETPOLINE */ expr->ops->eval(expr, regs, pkt); } #define nft_rule_expr_first(rule) (struct nft_expr *)&rule->data[0] #define nft_rule_expr_next(expr) ((void *)expr) + expr->ops->size #define nft_rule_expr_last(rule) (struct nft_expr *)&rule->data[rule->dlen] #define nft_rule_dp_for_each_expr(expr, last, rule) \ for ((expr) = nft_rule_expr_first(rule), (last) = nft_rule_expr_last(rule); \ (expr) != (last); \ (expr) = nft_rule_expr_next(expr)) unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv) { const struct nft_chain *chain = priv, *basechain = chain; const struct net *net = nft_net(pkt); const struct nft_expr *expr, *last; const struct nft_rule_dp *rule; struct nft_regs regs; unsigned int stackptr = 0; struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; bool genbit = READ_ONCE(net->nft.gencursor); struct nft_rule_blob *blob; struct nft_traceinfo info; info.trace = false; if (static_branch_unlikely(&nft_trace_enabled)) nft_trace_init(&info, pkt, basechain); do_chain: if (genbit) blob = rcu_dereference(chain->blob_gen_1); else blob = rcu_dereference(chain->blob_gen_0); rule = (struct nft_rule_dp *)blob->data; next_rule: regs.verdict.code = NFT_CONTINUE; for (; !rule->is_last ; rule = nft_rule_next(rule)) { nft_rule_dp_for_each_expr(expr, last, rule) { if (expr->ops == &nft_cmp_fast_ops) nft_cmp_fast_eval(expr, ®s); else if (expr->ops == &nft_cmp16_fast_ops) nft_cmp16_fast_eval(expr, ®s); else if (expr->ops == &nft_bitwise_fast_ops) nft_bitwise_fast_eval(expr, ®s); else if (expr->ops != &nft_payload_fast_ops || !nft_payload_fast_eval(expr, ®s, pkt)) expr_call_ops_eval(expr, ®s, pkt); if (regs.verdict.code != NFT_CONTINUE) break; } switch (regs.verdict.code) { case NFT_BREAK: regs.verdict.code = NFT_CONTINUE; nft_trace_copy_nftrace(pkt, &info); continue; case NFT_CONTINUE: nft_trace_packet(pkt, ®s.verdict, &info, rule, NFT_TRACETYPE_RULE); continue; } break; } nft_trace_verdict(pkt, &info, rule, ®s); switch (regs.verdict.code & NF_VERDICT_MASK) { case NF_ACCEPT: case NF_QUEUE: case NF_STOLEN: return regs.verdict.code; case NF_DROP: return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, EPERM); } switch (regs.verdict.code) { case NFT_JUMP: if (WARN_ON_ONCE(stackptr >= NFT_JUMP_STACK_SIZE)) return NF_DROP; jumpstack[stackptr].rule = nft_rule_next(rule); stackptr++; fallthrough; case NFT_GOTO: chain = regs.verdict.chain; goto do_chain; case NFT_CONTINUE: case NFT_RETURN: break; default: WARN_ON_ONCE(1); } if (stackptr > 0) { stackptr--; rule = jumpstack[stackptr].rule; goto next_rule; } nft_trace_packet(pkt, ®s.verdict, &info, NULL, NFT_TRACETYPE_POLICY); if (static_branch_unlikely(&nft_counters_enabled)) nft_update_chain_stats(basechain, pkt); if (nft_base_chain(basechain)->policy == NF_DROP) return NF_DROP_REASON(pkt->skb, SKB_DROP_REASON_NETFILTER_DROP, EPERM); return nft_base_chain(basechain)->policy; } EXPORT_SYMBOL_GPL(nft_do_chain); static struct nft_expr_type *nft_basic_types[] = { &nft_imm_type, &nft_cmp_type, &nft_lookup_type, &nft_bitwise_type, &nft_byteorder_type, &nft_payload_type, &nft_dynset_type, &nft_range_type, &nft_meta_type, &nft_rt_type, &nft_exthdr_type, &nft_last_type, &nft_counter_type, &nft_objref_type, &nft_inner_type, }; static struct nft_object_type *nft_basic_objects[] = { #ifdef CONFIG_NETWORK_SECMARK &nft_secmark_obj_type, #endif &nft_counter_obj_type, }; int __init nf_tables_core_module_init(void) { int err, i, j = 0; nft_counter_init_seqcount(); for (i = 0; i < ARRAY_SIZE(nft_basic_objects); i++) { err = nft_register_obj(nft_basic_objects[i]); if (err) goto err; } for (j = 0; j < ARRAY_SIZE(nft_basic_types); j++) { err = nft_register_expr(nft_basic_types[j]); if (err) goto err; } nf_skip_indirect_calls_enable(); return 0; err: while (j-- > 0) nft_unregister_expr(nft_basic_types[j]); while (i-- > 0) nft_unregister_obj(nft_basic_objects[i]); return err; } void nf_tables_core_module_exit(void) { int i; i = ARRAY_SIZE(nft_basic_types); while (i-- > 0) nft_unregister_expr(nft_basic_types[i]); i = ARRAY_SIZE(nft_basic_objects); while (i-- > 0) nft_unregister_obj(nft_basic_objects[i]); } |
41 41 41 3 3 3 3 1 2 3 9 9 3 6 9 9 9 9 35 35 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2008-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2021-2024 Intel Corporation */ #include <linux/export.h> #include <linux/etherdevice.h> #include <net/mac80211.h> #include <linux/unaligned.h> #include "ieee80211_i.h" #include "rate.h" #include "mesh.h" #include "led.h" #include "wme.h" void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw, struct sk_buff *skb) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); int tmp; skb->pkt_type = IEEE80211_TX_STATUS_MSG; skb_queue_tail(info->flags & IEEE80211_TX_CTL_REQ_TX_STATUS ? &local->skb_queue : &local->skb_queue_unreliable, skb); tmp = skb_queue_len(&local->skb_queue) + skb_queue_len(&local->skb_queue_unreliable); while (tmp > IEEE80211_IRQSAFE_QUEUE_LIMIT && (skb = skb_dequeue(&local->skb_queue_unreliable))) { ieee80211_free_txskb(hw, skb); tmp--; I802_DEBUG_INC(local->tx_status_drop); } tasklet_schedule(&local->tasklet); } EXPORT_SYMBOL(ieee80211_tx_status_irqsafe); static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *)skb->data; int ac; if (info->flags & (IEEE80211_TX_CTL_NO_PS_BUFFER | IEEE80211_TX_CTL_AMPDU | IEEE80211_TX_CTL_HW_80211_ENCAP)) { ieee80211_free_txskb(&local->hw, skb); return; } /* * This skb 'survived' a round-trip through the driver, and * hopefully the driver didn't mangle it too badly. However, * we can definitely not rely on the control information * being correct. Clear it so we don't get junk there, and * indicate that it needs new processing, but must not be * modified/encrypted again. */ memset(&info->control, 0, sizeof(info->control)); info->control.jiffies = jiffies; info->control.vif = &sta->sdata->vif; info->control.flags |= IEEE80211_TX_INTCFL_NEED_TXPROCESSING; info->flags |= IEEE80211_TX_INTFL_RETRANSMISSION; info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; sta->deflink.status_stats.filtered++; /* * Clear more-data bit on filtered frames, it might be set * but later frames might time out so it might have to be * clear again ... It's all rather unlikely (this frame * should time out first, right?) but let's not confuse * peers unnecessarily. */ if (hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_MOREDATA)) hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_MOREDATA); if (ieee80211_is_data_qos(hdr->frame_control)) { u8 *p = ieee80211_get_qos_ctl(hdr); int tid = *p & IEEE80211_QOS_CTL_TID_MASK; /* * Clear EOSP if set, this could happen e.g. * if an absence period (us being a P2P GO) * shortens the SP. */ if (*p & IEEE80211_QOS_CTL_EOSP) *p &= ~IEEE80211_QOS_CTL_EOSP; ac = ieee80211_ac_from_tid(tid); } else { ac = IEEE80211_AC_BE; } /* * Clear the TX filter mask for this STA when sending the next * packet. If the STA went to power save mode, this will happen * when it wakes up for the next time. */ set_sta_flag(sta, WLAN_STA_CLEAR_PS_FILT); ieee80211_clear_fast_xmit(sta); /* * This code races in the following way: * * (1) STA sends frame indicating it will go to sleep and does so * (2) hardware/firmware adds STA to filter list, passes frame up * (3) hardware/firmware processes TX fifo and suppresses a frame * (4) we get TX status before having processed the frame and * knowing that the STA has gone to sleep. * * This is actually quite unlikely even when both those events are * processed from interrupts coming in quickly after one another or * even at the same time because we queue both TX status events and * RX frames to be processed by a tasklet and process them in the * same order that they were received or TX status last. Hence, there * is no race as long as the frame RX is processed before the next TX * status, which drivers can ensure, see below. * * Note that this can only happen if the hardware or firmware can * actually add STAs to the filter list, if this is done by the * driver in response to set_tim() (which will only reduce the race * this whole filtering tries to solve, not completely solve it) * this situation cannot happen. * * To completely solve this race drivers need to make sure that they * (a) don't mix the irq-safe/not irq-safe TX status/RX processing * functions and * (b) always process RX events before TX status events if ordering * can be unknown, for example with different interrupt status * bits. * (c) if PS mode transitions are manual (i.e. the flag * %IEEE80211_HW_AP_LINK_PS is set), always process PS state * changes before calling TX status events if ordering can be * unknown. */ if (test_sta_flag(sta, WLAN_STA_PS_STA) && skb_queue_len(&sta->tx_filtered[ac]) < STA_MAX_TX_BUFFER) { skb_queue_tail(&sta->tx_filtered[ac], skb); sta_info_recalc_tim(sta); if (!timer_pending(&local->sta_cleanup)) mod_timer(&local->sta_cleanup, round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); return; } if (!test_sta_flag(sta, WLAN_STA_PS_STA) && !(info->flags & IEEE80211_TX_INTFL_RETRIED)) { /* Software retry the packet once */ info->flags |= IEEE80211_TX_INTFL_RETRIED; ieee80211_add_pending_skb(local, skb); return; } ps_dbg_ratelimited(sta->sdata, "dropped TX filtered frame, queue_len=%d PS=%d @%lu\n", skb_queue_len(&sta->tx_filtered[ac]), !!test_sta_flag(sta, WLAN_STA_PS_STA), jiffies); ieee80211_free_txskb(&local->hw, skb); } static void ieee80211_check_pending_bar(struct sta_info *sta, u8 *addr, u8 tid) { struct tid_ampdu_tx *tid_tx; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (!tid_tx || !tid_tx->bar_pending) return; tid_tx->bar_pending = false; ieee80211_send_bar(&sta->sdata->vif, addr, tid, tid_tx->failed_bar_ssn); } static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (void *) skb->data; if (ieee80211_is_data_qos(mgmt->frame_control)) { struct ieee80211_hdr *hdr = (void *) skb->data; u8 *qc = ieee80211_get_qos_ctl(hdr); u16 tid = qc[0] & 0xf; ieee80211_check_pending_bar(sta, hdr->addr1, tid); } } static void ieee80211_set_bar_pending(struct sta_info *sta, u8 tid, u16 ssn) { struct tid_ampdu_tx *tid_tx; tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); if (!tid_tx) return; tid_tx->failed_bar_ssn = ssn; tid_tx->bar_pending = true; } static int ieee80211_tx_radiotap_len(struct ieee80211_tx_info *info, struct ieee80211_tx_status *status) { struct ieee80211_rate_status *status_rate = NULL; int len = sizeof(struct ieee80211_radiotap_header); if (status && status->n_rates) status_rate = &status->rates[status->n_rates - 1]; /* IEEE80211_RADIOTAP_RATE rate */ if (status_rate && !(status_rate->rate_idx.flags & (RATE_INFO_FLAGS_MCS | RATE_INFO_FLAGS_DMG | RATE_INFO_FLAGS_EDMG | RATE_INFO_FLAGS_VHT_MCS | RATE_INFO_FLAGS_HE_MCS))) len += 2; else if (info->status.rates[0].idx >= 0 && !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))) len += 2; /* IEEE80211_RADIOTAP_TX_FLAGS */ len += 2; /* IEEE80211_RADIOTAP_DATA_RETRIES */ len += 1; /* IEEE80211_RADIOTAP_MCS * IEEE80211_RADIOTAP_VHT */ if (status_rate) { if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_MCS) len += 3; else if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_VHT_MCS) len = ALIGN(len, 2) + 12; else if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_HE_MCS) len = ALIGN(len, 2) + 12; } else if (info->status.rates[0].idx >= 0) { if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS) len += 3; else if (info->status.rates[0].flags & IEEE80211_TX_RC_VHT_MCS) len = ALIGN(len, 2) + 12; } return len; } static void ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, struct sk_buff *skb, int retry_count, int rtap_len, struct ieee80211_tx_status *status) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_radiotap_header *rthdr; struct ieee80211_rate_status *status_rate = NULL; unsigned char *pos; u16 legacy_rate = 0; u16 txflags; if (status && status->n_rates) status_rate = &status->rates[status->n_rates - 1]; rthdr = skb_push(skb, rtap_len); memset(rthdr, 0, rtap_len); rthdr->it_len = cpu_to_le16(rtap_len); rthdr->it_present = cpu_to_le32(BIT(IEEE80211_RADIOTAP_TX_FLAGS) | BIT(IEEE80211_RADIOTAP_DATA_RETRIES)); pos = (unsigned char *)(rthdr + 1); /* * XXX: Once radiotap gets the bitmap reset thing the vendor * extensions proposal contains, we can actually report * the whole set of tries we did. */ /* IEEE80211_RADIOTAP_RATE */ if (status_rate) { if (!(status_rate->rate_idx.flags & (RATE_INFO_FLAGS_MCS | RATE_INFO_FLAGS_DMG | RATE_INFO_FLAGS_EDMG | RATE_INFO_FLAGS_VHT_MCS | RATE_INFO_FLAGS_HE_MCS))) legacy_rate = status_rate->rate_idx.legacy; } else if (info->status.rates[0].idx >= 0 && !(info->status.rates[0].flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))) { struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[info->band]; legacy_rate = sband->bitrates[info->status.rates[0].idx].bitrate; } if (legacy_rate) { rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_RATE)); *pos = DIV_ROUND_UP(legacy_rate, 5); /* padding for tx flags */ pos += 2; } /* IEEE80211_RADIOTAP_TX_FLAGS */ txflags = 0; if (!(info->flags & IEEE80211_TX_STAT_ACK) && !is_multicast_ether_addr(hdr->addr1)) txflags |= IEEE80211_RADIOTAP_F_TX_FAIL; if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_CTS_PROTECT) txflags |= IEEE80211_RADIOTAP_F_TX_CTS; if (info->status.rates[0].flags & IEEE80211_TX_RC_USE_RTS_CTS) txflags |= IEEE80211_RADIOTAP_F_TX_RTS; put_unaligned_le16(txflags, pos); pos += 2; /* IEEE80211_RADIOTAP_DATA_RETRIES */ /* for now report the total retry_count */ *pos = retry_count; pos++; if (status_rate && (status_rate->rate_idx.flags & RATE_INFO_FLAGS_MCS)) { rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS)); pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS | IEEE80211_RADIOTAP_MCS_HAVE_GI | IEEE80211_RADIOTAP_MCS_HAVE_BW; if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_SHORT_GI) pos[1] |= IEEE80211_RADIOTAP_MCS_SGI; if (status_rate->rate_idx.bw == RATE_INFO_BW_40) pos[1] |= IEEE80211_RADIOTAP_MCS_BW_40; pos[2] = status_rate->rate_idx.mcs; pos += 3; } else if (status_rate && (status_rate->rate_idx.flags & RATE_INFO_FLAGS_VHT_MCS)) { u16 known = local->hw.radiotap_vht_details & (IEEE80211_RADIOTAP_VHT_KNOWN_GI | IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH); rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); /* u16 known - IEEE80211_RADIOTAP_VHT_KNOWN_* */ put_unaligned_le16(known, pos); pos += 2; /* u8 flags - IEEE80211_RADIOTAP_VHT_FLAG_* */ if (status_rate->rate_idx.flags & RATE_INFO_FLAGS_SHORT_GI) *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; pos++; /* u8 bandwidth */ switch (status_rate->rate_idx.bw) { case RATE_INFO_BW_160: *pos = 11; break; case RATE_INFO_BW_80: *pos = 4; break; case RATE_INFO_BW_40: *pos = 1; break; default: *pos = 0; break; } pos++; /* u8 mcs_nss[4] */ *pos = (status_rate->rate_idx.mcs << 4) | status_rate->rate_idx.nss; pos += 4; /* u8 coding */ pos++; /* u8 group_id */ pos++; /* u16 partial_aid */ pos += 2; } else if (status_rate && (status_rate->rate_idx.flags & RATE_INFO_FLAGS_HE_MCS)) { struct ieee80211_radiotap_he *he; rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_HE)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); he = (struct ieee80211_radiotap_he *)pos; he->data1 = cpu_to_le16(IEEE80211_RADIOTAP_HE_DATA1_FORMAT_SU | IEEE80211_RADIOTAP_HE_DATA1_DATA_MCS_KNOWN | IEEE80211_RADIOTAP_HE_DATA1_DATA_DCM_KNOWN | IEEE80211_RADIOTAP_HE_DATA1_BW_RU_ALLOC_KNOWN); he->data2 = cpu_to_le16(IEEE80211_RADIOTAP_HE_DATA2_GI_KNOWN); #define HE_PREP(f, val) le16_encode_bits(val, IEEE80211_RADIOTAP_HE_##f) he->data6 |= HE_PREP(DATA6_NSTS, status_rate->rate_idx.nss); #define CHECK_GI(s) \ BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_GI_##s != \ (int)NL80211_RATE_INFO_HE_GI_##s) CHECK_GI(0_8); CHECK_GI(1_6); CHECK_GI(3_2); he->data3 |= HE_PREP(DATA3_DATA_MCS, status_rate->rate_idx.mcs); he->data3 |= HE_PREP(DATA3_DATA_DCM, status_rate->rate_idx.he_dcm); he->data5 |= HE_PREP(DATA5_GI, status_rate->rate_idx.he_gi); switch (status_rate->rate_idx.bw) { case RATE_INFO_BW_20: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_20MHZ); break; case RATE_INFO_BW_40: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_40MHZ); break; case RATE_INFO_BW_80: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_80MHZ); break; case RATE_INFO_BW_160: he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_160MHZ); break; case RATE_INFO_BW_HE_RU: #define CHECK_RU_ALLOC(s) \ BUILD_BUG_ON(IEEE80211_RADIOTAP_HE_DATA5_DATA_BW_RU_ALLOC_##s##T != \ NL80211_RATE_INFO_HE_RU_ALLOC_##s + 4) CHECK_RU_ALLOC(26); CHECK_RU_ALLOC(52); CHECK_RU_ALLOC(106); CHECK_RU_ALLOC(242); CHECK_RU_ALLOC(484); CHECK_RU_ALLOC(996); CHECK_RU_ALLOC(2x996); he->data5 |= HE_PREP(DATA5_DATA_BW_RU_ALLOC, status_rate->rate_idx.he_ru_alloc + 4); break; default: WARN_ONCE(1, "Invalid SU BW %d\n", status_rate->rate_idx.bw); } pos += sizeof(struct ieee80211_radiotap_he); } if (status_rate || info->status.rates[0].idx < 0) return; /* IEEE80211_RADIOTAP_MCS * IEEE80211_RADIOTAP_VHT */ if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS) { rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS)); pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS | IEEE80211_RADIOTAP_MCS_HAVE_GI | IEEE80211_RADIOTAP_MCS_HAVE_BW; if (info->status.rates[0].flags & IEEE80211_TX_RC_SHORT_GI) pos[1] |= IEEE80211_RADIOTAP_MCS_SGI; if (info->status.rates[0].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) pos[1] |= IEEE80211_RADIOTAP_MCS_BW_40; if (info->status.rates[0].flags & IEEE80211_TX_RC_GREEN_FIELD) pos[1] |= IEEE80211_RADIOTAP_MCS_FMT_GF; pos[2] = info->status.rates[0].idx; pos += 3; } else if (info->status.rates[0].flags & IEEE80211_TX_RC_VHT_MCS) { u16 known = local->hw.radiotap_vht_details & (IEEE80211_RADIOTAP_VHT_KNOWN_GI | IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH); rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); /* u16 known - IEEE80211_RADIOTAP_VHT_KNOWN_* */ put_unaligned_le16(known, pos); pos += 2; /* u8 flags - IEEE80211_RADIOTAP_VHT_FLAG_* */ if (info->status.rates[0].flags & IEEE80211_TX_RC_SHORT_GI) *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; pos++; /* u8 bandwidth */ if (info->status.rates[0].flags & IEEE80211_TX_RC_40_MHZ_WIDTH) *pos = 1; else if (info->status.rates[0].flags & IEEE80211_TX_RC_80_MHZ_WIDTH) *pos = 4; else if (info->status.rates[0].flags & IEEE80211_TX_RC_160_MHZ_WIDTH) *pos = 11; else /* IEEE80211_TX_RC_{20_MHZ_WIDTH,FIXME:DUP_DATA} */ *pos = 0; pos++; /* u8 mcs_nss[4] */ *pos = (ieee80211_rate_get_vht_mcs(&info->status.rates[0]) << 4) | ieee80211_rate_get_vht_nss(&info->status.rates[0]); pos += 4; /* u8 coding */ pos++; /* u8 group_id */ pos++; /* u16 partial_aid */ pos += 2; } } /* * Handles the tx for TDLS teardown frames. * If the frame wasn't ACKed by the peer - it will be re-sent through the AP */ static void ieee80211_tdls_td_tx_handle(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 flags) { struct sk_buff *teardown_skb; struct sk_buff *orig_teardown_skb; bool is_teardown = false; /* Get the teardown data we need and free the lock */ spin_lock(&sdata->u.mgd.teardown_lock); teardown_skb = sdata->u.mgd.teardown_skb; orig_teardown_skb = sdata->u.mgd.orig_teardown_skb; if ((skb == orig_teardown_skb) && teardown_skb) { sdata->u.mgd.teardown_skb = NULL; sdata->u.mgd.orig_teardown_skb = NULL; is_teardown = true; } spin_unlock(&sdata->u.mgd.teardown_lock); if (is_teardown) { /* This mechanism relies on being able to get ACKs */ WARN_ON(!ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)); /* Check if peer has ACKed */ if (flags & IEEE80211_TX_STAT_ACK) { dev_kfree_skb_any(teardown_skb); } else { tdls_dbg(sdata, "TDLS Resending teardown through AP\n"); ieee80211_subif_start_xmit(teardown_skb, skb->dev); } } } static struct ieee80211_sub_if_data * ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_sub_if_data *sdata; if (skb->dev) { list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; if (skb->dev == sdata->dev) return sdata; } return NULL; } return rcu_dereference(local->p2p_sdata); } static void ieee80211_report_ack_skb(struct ieee80211_local *local, struct sk_buff *orig_skb, bool acked, bool dropped, ktime_t ack_hwtstamp) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(orig_skb); struct sk_buff *skb; unsigned long flags; spin_lock_irqsave(&local->ack_status_lock, flags); skb = idr_remove(&local->ack_status_frames, info->status_data); spin_unlock_irqrestore(&local->ack_status_lock, flags); if (!skb) return; if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) { u64 cookie = IEEE80211_SKB_CB(skb)->ack.cookie; struct ieee80211_sub_if_data *sdata; struct ieee80211_hdr *hdr = (void *)skb->data; bool is_valid_ack_signal = !!(info->status.flags & IEEE80211_TX_STATUS_ACK_SIGNAL_VALID); struct cfg80211_tx_status status = { .cookie = cookie, .buf = skb->data, .len = skb->len, .ack = acked, }; if (ieee80211_is_timing_measurement(orig_skb) || ieee80211_is_ftm(orig_skb)) { status.tx_tstamp = ktime_to_ns(skb_hwtstamps(orig_skb)->hwtstamp); status.ack_tstamp = ktime_to_ns(ack_hwtstamp); } rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); if (sdata) { if (skb->protocol == sdata->control_port_protocol || skb->protocol == cpu_to_be16(ETH_P_PREAUTH)) cfg80211_control_port_tx_status(&sdata->wdev, cookie, skb->data, skb->len, acked, GFP_ATOMIC); else if (ieee80211_is_any_nullfunc(hdr->frame_control)) cfg80211_probe_status(sdata->dev, hdr->addr1, cookie, acked, info->status.ack_signal, is_valid_ack_signal, GFP_ATOMIC); else if (ieee80211_is_mgmt(hdr->frame_control)) cfg80211_mgmt_tx_status_ext(&sdata->wdev, &status, GFP_ATOMIC); else pr_warn("Unknown status report in ack skb\n"); } rcu_read_unlock(); dev_kfree_skb_any(skb); } else if (dropped) { dev_kfree_skb_any(skb); } else { /* consumes skb */ skb_complete_wifi_ack(skb, acked); } } static void ieee80211_handle_smps_status(struct ieee80211_sub_if_data *sdata, bool acked, u16 status_data) { u16 sub_data = u16_get_bits(status_data, IEEE80211_STATUS_SUBDATA_MASK); enum ieee80211_smps_mode smps_mode = sub_data & 3; int link_id = (sub_data >> 2); struct ieee80211_link_data *link; if (!sdata || !ieee80211_sdata_running(sdata)) return; if (!acked) return; if (sdata->vif.type != NL80211_IFTYPE_STATION) return; if (WARN(link_id >= ARRAY_SIZE(sdata->link), "bad SMPS status link: %d\n", link_id)) return; link = rcu_dereference(sdata->link[link_id]); if (!link) return; /* * This update looks racy, but isn't, the only other place * updating this variable is in managed mode before assoc, * and we have to be associated to have a status from the * action frame TX, since we cannot send it while we're not * associated yet. */ link->smps_mode = smps_mode; wiphy_work_queue(sdata->local->hw.wiphy, &link->u.mgd.recalc_smps); } static void ieee80211_handle_teardown_ttlm_status(struct ieee80211_sub_if_data *sdata, bool acked) { if (!sdata || !ieee80211_sdata_running(sdata)) return; if (!acked) return; if (sdata->vif.type != NL80211_IFTYPE_STATION) return; wiphy_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.teardown_ttlm_work); } static void ieee80211_report_used_skb(struct ieee80211_local *local, struct sk_buff *skb, bool dropped, ktime_t ack_hwtstamp) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u16 tx_time_est = ieee80211_info_get_tx_time_est(info); struct ieee80211_hdr *hdr = (void *)skb->data; bool acked = info->flags & IEEE80211_TX_STAT_ACK; if (dropped) acked = false; if (tx_time_est) { struct sta_info *sta; rcu_read_lock(); sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2); ieee80211_sta_update_pending_airtime(local, sta, skb_get_queue_mapping(skb), tx_time_est, true); rcu_read_unlock(); } if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) { struct ieee80211_sub_if_data *sdata; rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); if (!sdata) { skb->dev = NULL; } else if (!dropped) { /* Check to see if packet is a TDLS teardown packet */ if (ieee80211_is_data(hdr->frame_control) && (ieee80211_get_tdls_action(skb) == WLAN_TDLS_TEARDOWN)) { ieee80211_tdls_td_tx_handle(local, sdata, skb, info->flags); } else if (ieee80211_s1g_is_twt_setup(skb)) { if (!acked) { struct sk_buff *qskb; qskb = skb_clone(skb, GFP_ATOMIC); if (qskb) { skb_queue_tail(&sdata->status_queue, qskb); wiphy_work_queue(local->hw.wiphy, &sdata->work); } } } else { ieee80211_mgd_conn_tx_status(sdata, hdr->frame_control, acked); } } rcu_read_unlock(); } else if (info->status_data_idr) { ieee80211_report_ack_skb(local, skb, acked, dropped, ack_hwtstamp); } else if (info->status_data) { struct ieee80211_sub_if_data *sdata; rcu_read_lock(); sdata = ieee80211_sdata_from_skb(local, skb); switch (u16_get_bits(info->status_data, IEEE80211_STATUS_TYPE_MASK)) { case IEEE80211_STATUS_TYPE_SMPS: ieee80211_handle_smps_status(sdata, acked, info->status_data); break; case IEEE80211_STATUS_TYPE_NEG_TTLM: ieee80211_handle_teardown_ttlm_status(sdata, acked); break; } rcu_read_unlock(); } if (!dropped && skb->destructor) { skb->wifi_acked_valid = 1; skb->wifi_acked = acked; } ieee80211_led_tx(local); if (skb_has_frag_list(skb)) { kfree_skb_list(skb_shinfo(skb)->frag_list); skb_shinfo(skb)->frag_list = NULL; } } /* * Use a static threshold for now, best value to be determined * by testing ... * Should it depend on: * - on # of retransmissions * - current throughput (higher value for higher tpt)? */ #define STA_LOST_PKT_THRESHOLD 50 #define STA_LOST_PKT_TIME HZ /* 1 sec since last ACK */ #define STA_LOST_TDLS_PKT_TIME (10*HZ) /* 10secs since last ACK */ static void ieee80211_lost_packet(struct sta_info *sta, struct ieee80211_tx_info *info) { unsigned long pkt_time = STA_LOST_PKT_TIME; unsigned int pkt_thr = STA_LOST_PKT_THRESHOLD; /* If driver relies on its own algorithm for station kickout, skip * mac80211 packet loss mechanism. */ if (ieee80211_hw_check(&sta->local->hw, REPORTS_LOW_ACK)) return; /* This packet was aggregated but doesn't carry status info */ if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) return; sta->deflink.status_stats.lost_packets++; if (sta->sta.tdls) { pkt_time = STA_LOST_TDLS_PKT_TIME; pkt_thr = STA_LOST_PKT_THRESHOLD; } /* * If we're in TDLS mode, make sure that all STA_LOST_PKT_THRESHOLD * of the last packets were lost, and that no ACK was received in the * last STA_LOST_TDLS_PKT_TIME ms, before triggering the CQM packet-loss * mechanism. * For non-TDLS, use STA_LOST_PKT_THRESHOLD and STA_LOST_PKT_TIME */ if (sta->deflink.status_stats.lost_packets < pkt_thr || !time_after(jiffies, sta->deflink.status_stats.last_pkt_time + pkt_time)) return; cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, sta->deflink.status_stats.lost_packets, GFP_ATOMIC); sta->deflink.status_stats.lost_packets = 0; } static int ieee80211_tx_get_rates(struct ieee80211_hw *hw, struct ieee80211_tx_info *info, int *retry_count) { int count = -1; int i; for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) { /* just the first aggr frame carry status info */ info->status.rates[i].idx = -1; info->status.rates[i].count = 0; break; } else if (info->status.rates[i].idx < 0) { break; } else if (i >= hw->max_report_rates) { /* the HW cannot have attempted that rate */ info->status.rates[i].idx = -1; info->status.rates[i].count = 0; break; } count += info->status.rates[i].count; } if (count < 0) count = 0; *retry_count = count; return i - 1; } void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, int retry_count, bool send_to_cooked, struct ieee80211_tx_status *status) { struct sk_buff *skb2; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_sub_if_data *sdata; struct net_device *prev_dev = NULL; int rtap_len; /* send frame to monitor interfaces now */ rtap_len = ieee80211_tx_radiotap_len(info, status); if (WARN_ON_ONCE(skb_headroom(skb) < rtap_len)) { pr_err("ieee80211_tx_status: headroom too small\n"); dev_kfree_skb(skb); return; } ieee80211_add_tx_radiotap_header(local, skb, retry_count, rtap_len, status); /* XXX: is this sufficient for BPF? */ skb_reset_mac_header(skb); skb->ip_summed = CHECKSUM_UNNECESSARY; skb->pkt_type = PACKET_OTHERHOST; skb->protocol = htons(ETH_P_802_2); memset(skb->cb, 0, sizeof(skb->cb)); rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { if (!ieee80211_sdata_running(sdata)) continue; if ((sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) && !send_to_cooked) continue; if (prev_dev) { skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) { skb2->dev = prev_dev; netif_rx(skb2); } } prev_dev = sdata->dev; } } if (prev_dev) { skb->dev = prev_dev; netif_rx(skb); skb = NULL; } rcu_read_unlock(); dev_kfree_skb(skb); } static void __ieee80211_tx_status(struct ieee80211_hw *hw, struct ieee80211_tx_status *status, int rates_idx, int retry_count) { struct sk_buff *skb = status->skb; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = status->info; struct sta_info *sta; __le16 fc; bool send_to_cooked; bool acked; bool noack_success; struct ieee80211_bar *bar; int tid = IEEE80211_NUM_TIDS; fc = hdr->frame_control; if (status->sta) { sta = container_of(status->sta, struct sta_info, sta); if (info->flags & IEEE80211_TX_STATUS_EOSP) clear_sta_flag(sta, WLAN_STA_SP); acked = !!(info->flags & IEEE80211_TX_STAT_ACK); noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED); /* mesh Peer Service Period support */ if (ieee80211_vif_is_mesh(&sta->sdata->vif) && ieee80211_is_data_qos(fc)) ieee80211_mpsp_trigger_process( ieee80211_get_qos_ctl(hdr), sta, true, acked); if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL) && (ieee80211_is_data(hdr->frame_control)) && (rates_idx != -1)) sta->deflink.tx_stats.last_rate = info->status.rates[rates_idx]; if ((info->flags & IEEE80211_TX_STAT_AMPDU_NO_BACK) && (ieee80211_is_data_qos(fc))) { u16 ssn; u8 *qc; qc = ieee80211_get_qos_ctl(hdr); tid = qc[0] & 0xf; ssn = ((le16_to_cpu(hdr->seq_ctrl) + 0x10) & IEEE80211_SCTL_SEQ); ieee80211_send_bar(&sta->sdata->vif, hdr->addr1, tid, ssn); } else if (ieee80211_is_data_qos(fc)) { u8 *qc = ieee80211_get_qos_ctl(hdr); tid = qc[0] & 0xf; } if (!acked && ieee80211_is_back_req(fc)) { u16 control; /* * BAR failed, store the last SSN and retry sending * the BAR when the next unicast transmission on the * same TID succeeds. */ bar = (struct ieee80211_bar *) skb->data; control = le16_to_cpu(bar->control); if (!(control & IEEE80211_BAR_CTRL_MULTI_TID)) { u16 ssn = le16_to_cpu(bar->start_seq_num); tid = (control & IEEE80211_BAR_CTRL_TID_INFO_MASK) >> IEEE80211_BAR_CTRL_TID_INFO_SHIFT; ieee80211_set_bar_pending(sta, tid, ssn); } } if (info->flags & IEEE80211_TX_STAT_TX_FILTERED) { ieee80211_handle_filtered_frame(local, sta, skb); return; } else if (ieee80211_is_data_present(fc)) { if (!acked && !noack_success) sta->deflink.status_stats.msdu_failed[tid]++; sta->deflink.status_stats.msdu_retries[tid] += retry_count; } if (!(info->flags & IEEE80211_TX_CTL_INJECTED) && acked) ieee80211_frame_acked(sta, skb); } /* SNMP counters * Fragments are passed to low-level drivers as separate skbs, so these * are actually fragments, not frames. Update frame counters only for * the first fragment of the frame. */ if ((info->flags & IEEE80211_TX_STAT_ACK) || (info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED)) { if (ieee80211_is_first_frag(hdr->seq_ctrl)) { I802_DEBUG_INC(local->dot11TransmittedFrameCount); if (is_multicast_ether_addr(ieee80211_get_DA(hdr))) I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); if (retry_count > 0) I802_DEBUG_INC(local->dot11RetryCount); if (retry_count > 1) I802_DEBUG_INC(local->dot11MultipleRetryCount); } /* This counter shall be incremented for an acknowledged MPDU * with an individual address in the address 1 field or an MPDU * with a multicast address in the address 1 field of type Data * or Management. */ if (!is_multicast_ether_addr(hdr->addr1) || ieee80211_is_data(fc) || ieee80211_is_mgmt(fc)) I802_DEBUG_INC(local->dot11TransmittedFragmentCount); } else { if (ieee80211_is_first_frag(hdr->seq_ctrl)) I802_DEBUG_INC(local->dot11FailedCount); } if (ieee80211_is_any_nullfunc(fc) && ieee80211_has_pm(fc) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS) && !(info->flags & IEEE80211_TX_CTL_INJECTED) && local->ps_sdata && !(local->scanning)) { if (info->flags & IEEE80211_TX_STAT_ACK) local->ps_sdata->u.mgd.flags |= IEEE80211_STA_NULLFUNC_ACKED; mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(10)); } ieee80211_report_used_skb(local, skb, false, status->ack_hwtstamp); /* this was a transmitted frame, but now we want to reuse it */ skb_orphan(skb); /* Need to make a copy before skb->cb gets cleared */ send_to_cooked = !!(info->flags & IEEE80211_TX_CTL_INJECTED) || !(ieee80211_is_data(fc)); /* * This is a bit racy but we can avoid a lot of work * with this test... */ if (!local->monitors && (!send_to_cooked || !local->cooked_mntrs)) { if (status->free_list) list_add_tail(&skb->list, status->free_list); else dev_kfree_skb(skb); return; } /* send to monitor interfaces */ ieee80211_tx_monitor(local, skb, retry_count, send_to_cooked, status); } void ieee80211_tx_status_skb(struct ieee80211_hw *hw, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_status status = { .skb = skb, .info = IEEE80211_SKB_CB(skb), }; struct sta_info *sta; rcu_read_lock(); sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2); if (sta) status.sta = &sta->sta; ieee80211_tx_status_ext(hw, &status); rcu_read_unlock(); } EXPORT_SYMBOL(ieee80211_tx_status_skb); void ieee80211_tx_status_ext(struct ieee80211_hw *hw, struct ieee80211_tx_status *status) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_tx_info *info = status->info; struct ieee80211_sta *pubsta = status->sta; struct sk_buff *skb = status->skb; struct sta_info *sta = NULL; int rates_idx, retry_count; bool acked, noack_success, ack_signal_valid; u16 tx_time_est; if (pubsta) { sta = container_of(pubsta, struct sta_info, sta); if (status->n_rates) sta->deflink.tx_stats.last_rate_info = status->rates[status->n_rates - 1].rate_idx; } if (skb && (tx_time_est = ieee80211_info_get_tx_time_est(IEEE80211_SKB_CB(skb))) > 0) { /* Do this here to avoid the expensive lookup of the sta * in ieee80211_report_used_skb(). */ ieee80211_sta_update_pending_airtime(local, sta, skb_get_queue_mapping(skb), tx_time_est, true); ieee80211_info_set_tx_time_est(IEEE80211_SKB_CB(skb), 0); } if (!status->info) goto free; rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); acked = !!(info->flags & IEEE80211_TX_STAT_ACK); noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED); ack_signal_valid = !!(info->status.flags & IEEE80211_TX_STATUS_ACK_SIGNAL_VALID); if (pubsta) { struct ieee80211_sub_if_data *sdata = sta->sdata; if (!acked && !noack_success) sta->deflink.status_stats.retry_failed++; sta->deflink.status_stats.retry_count += retry_count; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { if (sdata->vif.type == NL80211_IFTYPE_STATION && skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP)) ieee80211_sta_tx_notify(sdata, (void *) skb->data, acked, info->status.tx_time); if (acked) { sta->deflink.status_stats.last_ack = jiffies; if (sta->deflink.status_stats.lost_packets) sta->deflink.status_stats.lost_packets = 0; /* Track when last packet was ACKed */ sta->deflink.status_stats.last_pkt_time = jiffies; /* Reset connection monitor */ if (sdata->vif.type == NL80211_IFTYPE_STATION && unlikely(sdata->u.mgd.probe_send_count > 0)) sdata->u.mgd.probe_send_count = 0; if (ack_signal_valid) { sta->deflink.status_stats.last_ack_signal = (s8)info->status.ack_signal; sta->deflink.status_stats.ack_signal_filled = true; ewma_avg_signal_add(&sta->deflink.status_stats.avg_ack_signal, -info->status.ack_signal); } } else if (test_sta_flag(sta, WLAN_STA_PS_STA)) { /* * The STA is in power save mode, so assume * that this TX packet failed because of that. */ if (skb) ieee80211_handle_filtered_frame(local, sta, skb); return; } else if (noack_success) { /* nothing to do here, do not account as lost */ } else { ieee80211_lost_packet(sta, info); } } rate_control_tx_status(local, status); if (ieee80211_vif_is_mesh(&sta->sdata->vif)) ieee80211s_update_metric(local, sta, status); } if (skb && !(info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP)) return __ieee80211_tx_status(hw, status, rates_idx, retry_count); if (acked || noack_success) { I802_DEBUG_INC(local->dot11TransmittedFrameCount); if (!pubsta) I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); if (retry_count > 0) I802_DEBUG_INC(local->dot11RetryCount); if (retry_count > 1) I802_DEBUG_INC(local->dot11MultipleRetryCount); } else { I802_DEBUG_INC(local->dot11FailedCount); } free: if (!skb) return; ieee80211_report_used_skb(local, skb, false, status->ack_hwtstamp); if (status->free_list) list_add_tail(&skb->list, status->free_list); else dev_kfree_skb(skb); } EXPORT_SYMBOL(ieee80211_tx_status_ext); void ieee80211_tx_rate_update(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, struct ieee80211_tx_info *info) { struct ieee80211_local *local = hw_to_local(hw); struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_tx_status status = { .info = info, .sta = pubsta, }; rate_control_tx_status(local, &status); if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) sta->deflink.tx_stats.last_rate = info->status.rates[0]; } EXPORT_SYMBOL(ieee80211_tx_rate_update); void ieee80211_report_low_ack(struct ieee80211_sta *pubsta, u32 num_packets) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); cfg80211_cqm_pktloss_notify(sta->sdata->dev, sta->sta.addr, num_packets, GFP_ATOMIC); } EXPORT_SYMBOL(ieee80211_report_low_ack); void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb) { struct ieee80211_local *local = hw_to_local(hw); ktime_t kt = ktime_set(0, 0); ieee80211_report_used_skb(local, skb, true, kt); dev_kfree_skb_any(skb); } EXPORT_SYMBOL(ieee80211_free_txskb); void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, struct sk_buff_head *skbs) { struct sk_buff *skb; while ((skb = __skb_dequeue(skbs))) ieee80211_free_txskb(hw, skb); } EXPORT_SYMBOL(ieee80211_purge_tx_queue); |
8 8 3 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2015, Heiner Kallweit <hkallweit1@gmail.com> */ #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include "leds.h" DEFINE_LED_TRIGGER(bt_power_led_trigger); struct hci_basic_led_trigger { struct led_trigger led_trigger; struct hci_dev *hdev; }; #define to_hci_basic_led_trigger(arg) container_of(arg, \ struct hci_basic_led_trigger, led_trigger) void hci_leds_update_powered(struct hci_dev *hdev, bool enabled) { if (hdev->power_led) led_trigger_event(hdev->power_led, enabled ? LED_FULL : LED_OFF); if (!enabled) { struct hci_dev *d; read_lock(&hci_dev_list_lock); list_for_each_entry(d, &hci_dev_list, list) { if (test_bit(HCI_UP, &d->flags)) enabled = true; } read_unlock(&hci_dev_list_lock); } led_trigger_event(bt_power_led_trigger, enabled ? LED_FULL : LED_OFF); } static int power_activate(struct led_classdev *led_cdev) { struct hci_basic_led_trigger *htrig; bool powered; htrig = to_hci_basic_led_trigger(led_cdev->trigger); powered = test_bit(HCI_UP, &htrig->hdev->flags); led_set_brightness(led_cdev, powered ? LED_FULL : LED_OFF); return 0; } static struct led_trigger *led_allocate_basic(struct hci_dev *hdev, int (*activate)(struct led_classdev *led_cdev), const char *name) { struct hci_basic_led_trigger *htrig; htrig = devm_kzalloc(&hdev->dev, sizeof(*htrig), GFP_KERNEL); if (!htrig) return NULL; htrig->hdev = hdev; htrig->led_trigger.activate = activate; htrig->led_trigger.name = devm_kasprintf(&hdev->dev, GFP_KERNEL, "%s-%s", hdev->name, name); if (!htrig->led_trigger.name) goto err_alloc; if (devm_led_trigger_register(&hdev->dev, &htrig->led_trigger)) goto err_register; return &htrig->led_trigger; err_register: devm_kfree(&hdev->dev, (void *)htrig->led_trigger.name); err_alloc: devm_kfree(&hdev->dev, htrig); return NULL; } void hci_leds_init(struct hci_dev *hdev) { /* initialize power_led */ hdev->power_led = led_allocate_basic(hdev, power_activate, "power"); } void bt_leds_init(void) { led_trigger_register_simple("bluetooth-power", &bt_power_led_trigger); } void bt_leds_cleanup(void) { led_trigger_unregister_simple(bt_power_led_trigger); } |
300 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 | /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/fs.h> #define DEVCG_ACC_MKNOD 1 #define DEVCG_ACC_READ 2 #define DEVCG_ACC_WRITE 4 #define DEVCG_ACC_MASK (DEVCG_ACC_MKNOD | DEVCG_ACC_READ | DEVCG_ACC_WRITE) #define DEVCG_DEV_BLOCK 1 #define DEVCG_DEV_CHAR 2 #define DEVCG_DEV_ALL 4 /* this represents all devices */ #if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) int devcgroup_check_permission(short type, u32 major, u32 minor, short access); static inline int devcgroup_inode_permission(struct inode *inode, int mask) { short type, access = 0; if (likely(!inode->i_rdev)) return 0; if (S_ISBLK(inode->i_mode)) type = DEVCG_DEV_BLOCK; else if (S_ISCHR(inode->i_mode)) type = DEVCG_DEV_CHAR; else return 0; if (mask & MAY_WRITE) access |= DEVCG_ACC_WRITE; if (mask & MAY_READ) access |= DEVCG_ACC_READ; return devcgroup_check_permission(type, imajor(inode), iminor(inode), access); } static inline int devcgroup_inode_mknod(int mode, dev_t dev) { short type; if (!S_ISBLK(mode) && !S_ISCHR(mode)) return 0; if (S_ISCHR(mode) && dev == WHITEOUT_DEV) return 0; if (S_ISBLK(mode)) type = DEVCG_DEV_BLOCK; else type = DEVCG_DEV_CHAR; return devcgroup_check_permission(type, MAJOR(dev), MINOR(dev), DEVCG_ACC_MKNOD); } #else static inline int devcgroup_check_permission(short type, u32 major, u32 minor, short access) { return 0; } static inline int devcgroup_inode_permission(struct inode *inode, int mask) { return 0; } static inline int devcgroup_inode_mknod(int mode, dev_t dev) { return 0; } #endif |
4 1 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 | /* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/percpu.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include "rds.h" #include "tcp.h" DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats) ____cacheline_aligned; static const char * const rds_tcp_stat_names[] = { "tcp_data_ready_calls", "tcp_write_space_calls", "tcp_sndbuf_full", "tcp_connect_raced", "tcp_listen_closed_stale", }; unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail) { struct rds_tcp_statistics stats = {0, }; uint64_t *src; uint64_t *sum; size_t i; int cpu; if (avail < ARRAY_SIZE(rds_tcp_stat_names)) goto out; for_each_online_cpu(cpu) { src = (uint64_t *)&(per_cpu(rds_tcp_stats, cpu)); sum = (uint64_t *)&stats; for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) *(sum++) += *(src++); } rds_stats_info_copy(iter, (uint64_t *)&stats, rds_tcp_stat_names, ARRAY_SIZE(rds_tcp_stat_names)); out: return ARRAY_SIZE(rds_tcp_stat_names); } |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 | #ifndef _TCP_DCTCP_H #define _TCP_DCTCP_H static inline void dctcp_ece_ack_cwr(struct sock *sk, u32 ce_state) { struct tcp_sock *tp = tcp_sk(sk); if (ce_state == 1) tp->ecn_flags |= TCP_ECN_DEMAND_CWR; else tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; } /* Minimal DCTP CE state machine: * * S: 0 <- last pkt was non-CE * 1 <- last pkt was CE */ static inline void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, u32 *prior_rcv_nxt, u32 *ce_state) { u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0; if (*ce_state != new_ce_state) { /* CE state has changed, force an immediate ACK to * reflect the new CE state. If an ACK was delayed, * send that first to reflect the prior CE state. */ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { dctcp_ece_ack_cwr(sk, *ce_state); __tcp_send_ack(sk, *prior_rcv_nxt); } inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; } *prior_rcv_nxt = tcp_sk(sk)->rcv_nxt; *ce_state = new_ce_state; dctcp_ece_ack_cwr(sk, new_ce_state); } #endif |
84 8 81 33 15 16 16 31 11 3 10 2 6 38 37 28 28 24 2 4 1 4 1 1 1 3 1 28 33 2 30 3 4 1 1 5 28 27 6 8 5 5 2 8 26 20 6 22 22 1 22 1 23 5 19 1 25 36 29 37 14 1 3 26 1 22 12 34 34 1 29 5 33 1 7 16 29 3 1 28 23 14 37 2 4 2 12 1 78 2 2 71 50 1 3 46 49 34 48 20 19 11 48 10 12 32 11 32 38 39 13 9 4 1 22 30 50 50 41 6 39 1 31 2 31 15 17 68 2 1 10 4 25 1 1 25 1 24 2 1 5 25 1 24 3 31 33 33 35 1 38 1 40 27 6 62 8 32 43 42 6 3 3 6 6 2 1 8 5 1 4 4 1 1 8 3 5 5 1 12 9 1 23 1 22 1 1 10 9 18 79 1 1 2 1 75 72 73 18 9 8 1 7 17 12 1 1 1 3 1 1 2 1 5 1 6 3 4 3 4 1 5 4 1 5 2 5 11 2 1 1 1 7 2 5 5 16 1 1 1 1 11 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 | // SPDX-License-Identifier: GPL-2.0-only /* * "splice": joining two ropes together by interweaving their strands. * * This is the "extended pipe" functionality, where a pipe is used as * an arbitrary in-memory buffer. Think of a pipe as a small kernel * buffer that you can use to transfer data from one end to the other. * * The traditional unix read/write is extended with a "splice()" operation * that transfers data buffers to or from a pipe buffer. * * Named by Larry McVoy, original implementation from Linus, extended by * Jens to support splicing to files, network, direct splicing, etc and * fixing lots of bugs. * * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> * */ #include <linux/bvec.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/uio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/gfp.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/sched/signal.h> #include "internal.h" /* * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to * indicate they support non-blocking reads or writes, we must clear it * here if set to avoid blocking other users of this pipe if splice is * being done on it. */ static noinline void noinline pipe_clear_nowait(struct file *file) { fmode_t fmode = READ_ONCE(file->f_mode); do { if (!(fmode & FMODE_NOWAIT)) break; } while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT)); } /* * Attempt to steal a page from a pipe buffer. This should perhaps go into * a vm helper function, it's already simplified quite a bit by the * addition of remove_mapping(). If success is returned, the caller may * attempt to reuse this page for another destination. */ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct folio *folio = page_folio(buf->page); struct address_space *mapping; folio_lock(folio); mapping = folio_mapping(folio); if (mapping) { WARN_ON(!folio_test_uptodate(folio)); /* * At least for ext2 with nobh option, we need to wait on * writeback completing on this folio, since we'll remove it * from the pagecache. Otherwise truncate wont wait on the * folio, allowing the disk blocks to be reused by someone else * before we actually wrote our data to them. fs corruption * ensues. */ folio_wait_writeback(folio); if (!filemap_release_folio(folio, GFP_KERNEL)) goto out_unlock; /* * If we succeeded in removing the mapping, set LRU flag * and return good. */ if (remove_mapping(mapping, folio)) { buf->flags |= PIPE_BUF_FLAG_LRU; return true; } } /* * Raced with truncate or failed to remove folio from current * address space, unlock and return failure. */ out_unlock: folio_unlock(folio); return false; } static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { put_page(buf->page); buf->flags &= ~PIPE_BUF_FLAG_LRU; } /* * Check whether the contents of buf is OK to access. Since the content * is a page cache page, IO may be in flight. */ static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct folio *folio = page_folio(buf->page); int err; if (!folio_test_uptodate(folio)) { folio_lock(folio); /* * Folio got truncated/unhashed. This will cause a 0-byte * splice, if this is the first page. */ if (!folio->mapping) { err = -ENODATA; goto error; } /* * Uh oh, read-error from disk. */ if (!folio_test_uptodate(folio)) { err = -EIO; goto error; } /* Folio is ok after all, we are done */ folio_unlock(folio); } return 0; error: folio_unlock(folio); return err; } const struct pipe_buf_operations page_cache_pipe_buf_ops = { .confirm = page_cache_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .try_steal = page_cache_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) return false; buf->flags |= PIPE_BUF_FLAG_LRU; return generic_pipe_buf_try_steal(pipe, buf); } static const struct pipe_buf_operations user_page_pipe_buf_ops = { .release = page_cache_pipe_buf_release, .try_steal = user_page_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; static void wakeup_pipe_readers(struct pipe_inode_info *pipe) { smp_mb(); if (waitqueue_active(&pipe->rd_wait)) wake_up_interruptible(&pipe->rd_wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } /** * splice_to_pipe - fill passed data into a pipe * @pipe: pipe to fill * @spd: data to fill * * Description: * @spd contains a map of pages and len/offset tuples, along with * the struct pipe_buf_operations associated with these pages. This * function will link that data to the pipe. * */ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { unsigned int spd_pages = spd->nr_pages; unsigned int tail = pipe->tail; unsigned int head = pipe->head; unsigned int mask = pipe->ring_size - 1; ssize_t ret = 0; int page_nr = 0; if (!spd_pages) return 0; if (unlikely(!pipe->readers)) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } while (!pipe_full(head, tail, pipe->max_usage)) { struct pipe_buffer *buf = &pipe->bufs[head & mask]; buf->page = spd->pages[page_nr]; buf->offset = spd->partial[page_nr].offset; buf->len = spd->partial[page_nr].len; buf->private = spd->partial[page_nr].private; buf->ops = spd->ops; buf->flags = 0; head++; pipe->head = head; page_nr++; ret += buf->len; if (!--spd->nr_pages) break; } if (!ret) ret = -EAGAIN; out: while (page_nr < spd_pages) spd->spd_release(spd, page_nr++); return ret; } EXPORT_SYMBOL_GPL(splice_to_pipe); ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { unsigned int head = pipe->head; unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; int ret; if (unlikely(!pipe->readers)) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; } else if (pipe_full(head, tail, pipe->max_usage)) { ret = -EAGAIN; } else { pipe->bufs[head & mask] = *buf; pipe->head = head + 1; return buf->len; } pipe_buf_release(pipe, buf); return ret; } EXPORT_SYMBOL(add_to_pipe); /* * Check if we need to grow the arrays holding pages and partial page * descriptions. */ int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { unsigned int max_usage = READ_ONCE(pipe->max_usage); spd->nr_pages_max = max_usage; if (max_usage <= PIPE_DEF_BUFFERS) return 0; spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL); spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page), GFP_KERNEL); if (spd->pages && spd->partial) return 0; kfree(spd->pages); kfree(spd->partial); return -ENOMEM; } void splice_shrink_spd(struct splice_pipe_desc *spd) { if (spd->nr_pages_max <= PIPE_DEF_BUFFERS) return; kfree(spd->pages); kfree(spd->partial); } /** * copy_splice_read - Copy data from a file and splice the copy into a pipe * @in: The file to read from * @ppos: Pointer to the file position to read from * @pipe: The pipe to splice into * @len: The amount to splice * @flags: The SPLICE_F_* flags * * This function allocates a bunch of pages sufficient to hold the requested * amount of data (but limited by the remaining pipe capacity), passes it to * the file's ->read_iter() to read into and then splices the used pages into * the pipe. * * Return: On success, the number of bytes read will be returned and *@ppos * will be updated if appropriate; 0 will be returned if there is no more data * to be read; -EAGAIN will be returned if the pipe had no space, and some * other negative error code will be returned on error. A short read may occur * if the pipe has insufficient space, we reach the end of the data or we hit a * hole. */ ssize_t copy_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct iov_iter to; struct bio_vec *bv; struct kiocb kiocb; struct page **pages; ssize_t ret; size_t used, npages, chunk, remain, keep = 0; int i; /* Work out how much data we can actually add into the pipe */ used = pipe_occupancy(pipe->head, pipe->tail); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); npages = DIV_ROUND_UP(len, PAGE_SIZE); bv = kzalloc(array_size(npages, sizeof(bv[0])) + array_size(npages, sizeof(struct page *)), GFP_KERNEL); if (!bv) return -ENOMEM; pages = (struct page **)(bv + npages); npages = alloc_pages_bulk_array(GFP_USER, npages, pages); if (!npages) { kfree(bv); return -ENOMEM; } remain = len = min_t(size_t, len, npages * PAGE_SIZE); for (i = 0; i < npages; i++) { chunk = min_t(size_t, PAGE_SIZE, remain); bv[i].bv_page = pages[i]; bv[i].bv_offset = 0; bv[i].bv_len = chunk; remain -= chunk; } /* Do the I/O */ iov_iter_bvec(&to, ITER_DEST, bv, npages, len); init_sync_kiocb(&kiocb, in); kiocb.ki_pos = *ppos; ret = in->f_op->read_iter(&kiocb, &to); if (ret > 0) { keep = DIV_ROUND_UP(ret, PAGE_SIZE); *ppos = kiocb.ki_pos; } /* * Callers of ->splice_read() expect -EAGAIN on "can't put anything in * there", rather than -EFAULT. */ if (ret == -EFAULT) ret = -EAGAIN; /* Free any pages that didn't get touched at all. */ if (keep < npages) release_pages(pages + keep, npages - keep); /* Push the remaining pages into the pipe. */ remain = ret; for (i = 0; i < keep; i++) { struct pipe_buffer *buf = pipe_head_buf(pipe); chunk = min_t(size_t, remain, PAGE_SIZE); *buf = (struct pipe_buffer) { .ops = &default_pipe_buf_ops, .page = bv[i].bv_page, .offset = 0, .len = chunk, }; pipe->head++; remain -= chunk; } kfree(bv); return ret; } EXPORT_SYMBOL(copy_splice_read); const struct pipe_buf_operations default_pipe_buf_ops = { .release = generic_pipe_buf_release, .try_steal = generic_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; /* Pipe buffer operations for a socket and similar. */ const struct pipe_buf_operations nosteal_pipe_buf_ops = { .release = generic_pipe_buf_release, .get = generic_pipe_buf_get, }; EXPORT_SYMBOL(nosteal_pipe_buf_ops); static void wakeup_pipe_writers(struct pipe_inode_info *pipe) { smp_mb(); if (waitqueue_active(&pipe->wr_wait)) wake_up_interruptible(&pipe->wr_wait); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } /** * splice_from_pipe_feed - feed available data from a pipe to a file * @pipe: pipe to splice from * @sd: information to @actor * @actor: handler that splices the data * * Description: * This function loops over the pipe and calls @actor to do the * actual moving of a single struct pipe_buffer to the desired * destination. It returns when there's no more buffers left in * the pipe or if the requested number of bytes (@sd->total_len) * have been copied. It returns a positive number (one) if the * pipe needs to be filled with more data, zero if the required * number of bytes have been copied and -errno on error. * * This, together with splice_from_pipe_{begin,end,next}, may be * used to implement the functionality of __splice_from_pipe() when * locking is required around copying the pipe buffers to the * destination. */ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_actor *actor) { unsigned int head = pipe->head; unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; int ret; while (!pipe_empty(head, tail)) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; sd->len = buf->len; if (sd->len > sd->total_len) sd->len = sd->total_len; ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; return ret; } ret = actor(pipe, buf, sd); if (ret <= 0) return ret; buf->offset += ret; buf->len -= ret; sd->num_spliced += ret; sd->len -= ret; sd->pos += ret; sd->total_len -= ret; if (!buf->len) { pipe_buf_release(pipe, buf); tail++; pipe->tail = tail; if (pipe->files) sd->need_wakeup = true; } if (!sd->total_len) return 0; } return 1; } /* We know we have a pipe buffer, but maybe it's empty? */ static inline bool eat_empty_buffer(struct pipe_inode_info *pipe) { unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf = &pipe->bufs[tail & mask]; if (unlikely(!buf->len)) { pipe_buf_release(pipe, buf); pipe->tail = tail+1; return true; } return false; } /** * splice_from_pipe_next - wait for some data to splice from * @pipe: pipe to splice from * @sd: information about the splice operation * * Description: * This function will wait for some data and return a positive * value (one) if pipe buffers are available. It will return zero * or -errno if no more data needs to be spliced. */ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) { /* * Check for signal early to make process killable when there are * always buffers available */ if (signal_pending(current)) return -ERESTARTSYS; repeat: while (pipe_empty(pipe->head, pipe->tail)) { if (!pipe->writers) return 0; if (sd->num_spliced) return 0; if (sd->flags & SPLICE_F_NONBLOCK) return -EAGAIN; if (signal_pending(current)) return -ERESTARTSYS; if (sd->need_wakeup) { wakeup_pipe_writers(pipe); sd->need_wakeup = false; } pipe_wait_readable(pipe); } if (eat_empty_buffer(pipe)) goto repeat; return 1; } /** * splice_from_pipe_begin - start splicing from pipe * @sd: information about the splice operation * * Description: * This function should be called before a loop containing * splice_from_pipe_next() and splice_from_pipe_feed() to * initialize the necessary fields of @sd. */ static void splice_from_pipe_begin(struct splice_desc *sd) { sd->num_spliced = 0; sd->need_wakeup = false; } /** * splice_from_pipe_end - finish splicing from pipe * @pipe: pipe to splice from * @sd: information about the splice operation * * Description: * This function will wake up pipe writers if necessary. It should * be called after a loop containing splice_from_pipe_next() and * splice_from_pipe_feed(). */ static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) { if (sd->need_wakeup) wakeup_pipe_writers(pipe); } /** * __splice_from_pipe - splice data from a pipe to given actor * @pipe: pipe to splice from * @sd: information to @actor * @actor: handler that splices the data * * Description: * This function does little more than loop over the pipe and call * @actor to do the actual moving of a single struct pipe_buffer to * the desired destination. See pipe_to_file, pipe_to_sendmsg, or * pipe_to_user. * */ ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_actor *actor) { int ret; splice_from_pipe_begin(sd); do { cond_resched(); ret = splice_from_pipe_next(pipe, sd); if (ret > 0) ret = splice_from_pipe_feed(pipe, sd, actor); } while (ret > 0); splice_from_pipe_end(pipe, sd); return sd->num_spliced ? sd->num_spliced : ret; } EXPORT_SYMBOL(__splice_from_pipe); /** * splice_from_pipe - splice data from a pipe to a file * @pipe: pipe to splice from * @out: file to splice to * @ppos: position in @out * @len: how many bytes to splice * @flags: splice modifier flags * @actor: handler that splices the data * * Description: * See __splice_from_pipe. This function locks the pipe inode, * otherwise it's identical to __splice_from_pipe(). * */ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags, splice_actor *actor) { ssize_t ret; struct splice_desc sd = { .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, }; pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, actor); pipe_unlock(pipe); return ret; } /** * iter_file_splice_write - splice data from a pipe to a file * @pipe: pipe info * @out: file to write to * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will either move or copy pages (determined by @flags options) from * the given pipe inode to the given file. * This one is ->write_iter-based. * */ ssize_t iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct splice_desc sd = { .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, }; int nbufs = pipe->max_usage; struct bio_vec *array; ssize_t ret; if (!out->f_op->write_iter) return -EINVAL; array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); if (unlikely(!array)) return -ENOMEM; pipe_lock(pipe); splice_from_pipe_begin(&sd); while (sd.total_len) { struct kiocb kiocb; struct iov_iter from; unsigned int head, tail, mask; size_t left; int n; ret = splice_from_pipe_next(pipe, &sd); if (ret <= 0) break; if (unlikely(nbufs < pipe->max_usage)) { kfree(array); nbufs = pipe->max_usage; array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL); if (!array) { ret = -ENOMEM; break; } } head = pipe->head; tail = pipe->tail; mask = pipe->ring_size - 1; /* build the vector */ left = sd.total_len; for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t this_len = buf->len; /* zero-length bvecs are not supported, skip them */ if (!this_len) continue; this_len = min(this_len, left); ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; goto done; } bvec_set_page(&array[n], buf->page, this_len, buf->offset); left -= this_len; n++; } iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); init_sync_kiocb(&kiocb, out); kiocb.ki_pos = sd.pos; ret = out->f_op->write_iter(&kiocb, &from); sd.pos = kiocb.ki_pos; if (ret <= 0) break; sd.num_spliced += ret; sd.total_len -= ret; *ppos = sd.pos; /* dismiss the fully eaten buffers, adjust the partial one */ tail = pipe->tail; while (ret) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; if (ret >= buf->len) { ret -= buf->len; buf->len = 0; pipe_buf_release(pipe, buf); tail++; pipe->tail = tail; if (pipe->files) sd.need_wakeup = true; } else { buf->offset += ret; buf->len -= ret; ret = 0; } } } done: kfree(array); splice_from_pipe_end(pipe, &sd); pipe_unlock(pipe); if (sd.num_spliced) ret = sd.num_spliced; return ret; } EXPORT_SYMBOL(iter_file_splice_write); #ifdef CONFIG_NET /** * splice_to_socket - splice data from a pipe to a socket * @pipe: pipe to splice from * @out: socket to write to * @ppos: position in @out * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will send @len bytes from the pipe to a network socket. No data copying * is involved. * */ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { struct socket *sock = sock_from_file(out); struct bio_vec bvec[16]; struct msghdr msg = {}; ssize_t ret = 0; size_t spliced = 0; bool need_wakeup = false; pipe_lock(pipe); while (len > 0) { unsigned int head, tail, mask, bc = 0; size_t remain = len; /* * Check for signal early to make process killable when there * are always buffers available */ ret = -ERESTARTSYS; if (signal_pending(current)) break; while (pipe_empty(pipe->head, pipe->tail)) { ret = 0; if (!pipe->writers) goto out; if (spliced) goto out; ret = -EAGAIN; if (flags & SPLICE_F_NONBLOCK) goto out; ret = -ERESTARTSYS; if (signal_pending(current)) goto out; if (need_wakeup) { wakeup_pipe_writers(pipe); need_wakeup = false; } pipe_wait_readable(pipe); } head = pipe->head; tail = pipe->tail; mask = pipe->ring_size - 1; while (!pipe_empty(head, tail)) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t seg; if (!buf->len) { tail++; continue; } seg = min_t(size_t, remain, buf->len); ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { if (ret == -ENODATA) ret = 0; break; } bvec_set_page(&bvec[bc++], buf->page, seg, buf->offset); remain -= seg; if (remain == 0 || bc >= ARRAY_SIZE(bvec)) break; tail++; } if (!bc) break; msg.msg_flags = MSG_SPLICE_PAGES; if (flags & SPLICE_F_MORE) msg.msg_flags |= MSG_MORE; if (remain && pipe_occupancy(pipe->head, tail) > 0) msg.msg_flags |= MSG_MORE; if (out->f_flags & O_NONBLOCK) msg.msg_flags |= MSG_DONTWAIT; iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvec, bc, len - remain); ret = sock_sendmsg(sock, &msg); if (ret <= 0) break; spliced += ret; len -= ret; tail = pipe->tail; while (ret > 0) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t seg = min_t(size_t, ret, buf->len); buf->offset += seg; buf->len -= seg; ret -= seg; if (!buf->len) { pipe_buf_release(pipe, buf); tail++; } } if (tail != pipe->tail) { pipe->tail = tail; if (pipe->files) need_wakeup = true; } } out: pipe_unlock(pipe); if (need_wakeup) wakeup_pipe_writers(pipe); return spliced ?: ret; } #endif static int warn_unsupported(struct file *file, const char *op) { pr_debug_ratelimited( "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n", op, file, current->pid, current->comm); return -EINVAL; } /* * Attempt to initiate a splice from pipe to file. */ static ssize_t do_splice_from(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { if (unlikely(!out->f_op->splice_write)) return warn_unsupported(out, "write"); return out->f_op->splice_write(pipe, out, ppos, len, flags); } /* * Indicate to the caller that there was a premature EOF when reading from the * source and the caller didn't indicate they would be sending more data after * this. */ static void do_splice_eof(struct splice_desc *sd) { if (sd->splice_eof) sd->splice_eof(sd); } /* * Callers already called rw_verify_area() on the entire range. * No need to call it for sub ranges. */ static ssize_t do_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { unsigned int p_space; if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; if (!len) return 0; /* Don't try to read more the pipe has space for. */ p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); len = min_t(size_t, len, p_space << PAGE_SHIFT); if (unlikely(len > MAX_RW_COUNT)) len = MAX_RW_COUNT; if (unlikely(!in->f_op->splice_read)) return warn_unsupported(in, "read"); /* * O_DIRECT and DAX don't deal with the pagecache, so we allocate a * buffer, copy into it and splice that into the pipe. */ if ((in->f_flags & O_DIRECT) || IS_DAX(in->f_mapping->host)) return copy_splice_read(in, ppos, pipe, len, flags); return in->f_op->splice_read(in, ppos, pipe, len, flags); } /** * vfs_splice_read - Read data from a file and splice it into a pipe * @in: File to splice from * @ppos: Input file offset * @pipe: Pipe to splice to * @len: Number of bytes to splice * @flags: Splice modifier flags (SPLICE_F_*) * * Splice the requested amount of data from the input file to the pipe. This * is synchronous as the caller must hold the pipe lock across the entire * operation. * * If successful, it returns the amount of data spliced, 0 if it hit the EOF or * a hole and a negative error code otherwise. */ ssize_t vfs_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { ssize_t ret; ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) return ret; return do_splice_read(in, ppos, pipe, len, flags); } EXPORT_SYMBOL_GPL(vfs_splice_read); /** * splice_direct_to_actor - splices data directly between two non-pipes * @in: file to splice from * @sd: actor information on where to splice to * @actor: handles the data splicing * * Description: * This is a special case helper to splice directly between two * points, without requiring an explicit pipe. Internally an allocated * pipe is cached in the process, and reused during the lifetime of * that process. * */ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, splice_direct_actor *actor) { struct pipe_inode_info *pipe; ssize_t ret, bytes; size_t len; int i, flags, more; /* * We require the input to be seekable, as we don't want to randomly * drop data for eg socket -> socket splicing. Use the piped splicing * for that! */ if (unlikely(!(in->f_mode & FMODE_LSEEK))) return -EINVAL; /* * neither in nor out is a pipe, setup an internal pipe attached to * 'out' and transfer the wanted data from 'in' to 'out' through that */ pipe = current->splice_pipe; if (unlikely(!pipe)) { pipe = alloc_pipe_info(); if (!pipe) return -ENOMEM; /* * We don't have an immediate reader, but we'll read the stuff * out of the pipe right after the splice_to_pipe(). So set * PIPE_READERS appropriately. */ pipe->readers = 1; current->splice_pipe = pipe; } /* * Do the splice. */ bytes = 0; len = sd->total_len; /* Don't block on output, we have to drain the direct pipe. */ flags = sd->flags; sd->flags &= ~SPLICE_F_NONBLOCK; /* * We signal MORE until we've read sufficient data to fulfill the * request and we keep signalling it if the caller set it. */ more = sd->flags & SPLICE_F_MORE; sd->flags |= SPLICE_F_MORE; WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail)); while (len) { size_t read_len; loff_t pos = sd->pos, prev_pos = pos; ret = do_splice_read(in, &pos, pipe, len, flags); if (unlikely(ret <= 0)) goto read_failure; read_len = ret; sd->total_len = read_len; /* * If we now have sufficient data to fulfill the request then * we clear SPLICE_F_MORE if it was not set initially. */ if (read_len >= len && !more) sd->flags &= ~SPLICE_F_MORE; /* * NOTE: nonblocking mode only applies to the input. We * must not do the output in nonblocking mode as then we * could get stuck data in the internal pipe: */ ret = actor(pipe, sd); if (unlikely(ret <= 0)) { sd->pos = prev_pos; goto out_release; } bytes += ret; len -= ret; sd->pos = pos; if (ret < read_len) { sd->pos = prev_pos + ret; goto out_release; } } done: pipe->tail = pipe->head = 0; file_accessed(in); return bytes; read_failure: /* * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a * "->splice_in()" that returned EOF (ie zero) *and* we have sent at * least 1 byte *then* we will also do the ->splice_eof() call. */ if (ret == 0 && !more && len > 0 && bytes) do_splice_eof(sd); out_release: /* * If we did an incomplete transfer we must release * the pipe buffers in question: */ for (i = 0; i < pipe->ring_size; i++) { struct pipe_buffer *buf = &pipe->bufs[i]; if (buf->ops) pipe_buf_release(pipe, buf); } if (!bytes) bytes = ret; goto done; } EXPORT_SYMBOL(splice_direct_to_actor); static int direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) { struct file *file = sd->u.file; long ret; file_start_write(file); ret = do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); file_end_write(file); return ret; } static int splice_file_range_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) { struct file *file = sd->u.file; return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); } static void direct_file_splice_eof(struct splice_desc *sd) { struct file *file = sd->u.file; if (file->f_op->splice_eof) file->f_op->splice_eof(file); } static ssize_t do_splice_direct_actor(struct file *in, loff_t *ppos, struct file *out, loff_t *opos, size_t len, unsigned int flags, splice_direct_actor *actor) { struct splice_desc sd = { .len = len, .total_len = len, .flags = flags, .pos = *ppos, .u.file = out, .splice_eof = direct_file_splice_eof, .opos = opos, }; ssize_t ret; if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; ret = splice_direct_to_actor(in, &sd, actor); if (ret > 0) *ppos = sd.pos; return ret; } /** * do_splice_direct - splices data directly between two files * @in: file to splice from * @ppos: input file offset * @out: file to splice to * @opos: output file offset * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * For use by do_sendfile(). splice can easily emulate sendfile, but * doing it in the application would incur an extra system call * (splice in + splice out, as compared to just sendfile()). So this helper * can splice directly through a process-private pipe. * * Callers already called rw_verify_area() on the entire range. */ ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out, loff_t *opos, size_t len, unsigned int flags) { return do_splice_direct_actor(in, ppos, out, opos, len, flags, direct_splice_actor); } EXPORT_SYMBOL(do_splice_direct); /** * splice_file_range - splices data between two files for copy_file_range() * @in: file to splice from * @ppos: input file offset * @out: file to splice to * @opos: output file offset * @len: number of bytes to splice * * Description: * For use by ->copy_file_range() methods. * Like do_splice_direct(), but vfs_copy_file_range() already holds * start_file_write() on @out file. * * Callers already called rw_verify_area() on the entire range. */ ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out, loff_t *opos, size_t len) { lockdep_assert(file_write_started(out)); return do_splice_direct_actor(in, ppos, out, opos, min_t(size_t, len, MAX_RW_COUNT), 0, splice_file_range_actor); } EXPORT_SYMBOL(splice_file_range); static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) { for (;;) { if (unlikely(!pipe->readers)) { send_sig(SIGPIPE, current, 0); return -EPIPE; } if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) return 0; if (flags & SPLICE_F_NONBLOCK) return -EAGAIN; if (signal_pending(current)) return -ERESTARTSYS; pipe_wait_writable(pipe); } } static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); ssize_t splice_file_to_pipe(struct file *in, struct pipe_inode_info *opipe, loff_t *offset, size_t len, unsigned int flags) { ssize_t ret; pipe_lock(opipe); ret = wait_for_space(opipe, flags); if (!ret) ret = do_splice_read(in, offset, opipe, len, flags); pipe_unlock(opipe); if (ret > 0) wakeup_pipe_readers(opipe); return ret; } /* * Determine where to splice to/from. */ ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out, loff_t *off_out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; loff_t offset; ssize_t ret; if (unlikely(!(in->f_mode & FMODE_READ) || !(out->f_mode & FMODE_WRITE))) return -EBADF; ipipe = get_pipe_info(in, true); opipe = get_pipe_info(out, true); if (ipipe && opipe) { if (off_in || off_out) return -ESPIPE; /* Splicing to self would be fun, but... */ if (ipipe == opipe) return -EINVAL; if ((in->f_flags | out->f_flags) & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; ret = splice_pipe_to_pipe(ipipe, opipe, len, flags); } else if (ipipe) { if (off_in) return -ESPIPE; if (off_out) { if (!(out->f_mode & FMODE_PWRITE)) return -EINVAL; offset = *off_out; } else { offset = out->f_pos; } if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; ret = rw_verify_area(WRITE, out, &offset, len); if (unlikely(ret < 0)) return ret; if (in->f_flags & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; file_start_write(out); ret = do_splice_from(ipipe, out, &offset, len, flags); file_end_write(out); if (!off_out) out->f_pos = offset; else *off_out = offset; } else if (opipe) { if (off_out) return -ESPIPE; if (off_in) { if (!(in->f_mode & FMODE_PREAD)) return -EINVAL; offset = *off_in; } else { offset = in->f_pos; } ret = rw_verify_area(READ, in, &offset, len); if (unlikely(ret < 0)) return ret; if (out->f_flags & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; ret = splice_file_to_pipe(in, opipe, &offset, len, flags); if (!off_in) in->f_pos = offset; else *off_in = offset; } else { ret = -EINVAL; } if (ret > 0) { /* * Generate modify out before access in: * do_splice_from() may've already sent modify out, * and this ensures the events get merged. */ fsnotify_modify(out); fsnotify_access(in); } return ret; } static ssize_t __do_splice(struct file *in, loff_t __user *off_in, struct file *out, loff_t __user *off_out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; loff_t offset, *__off_in = NULL, *__off_out = NULL; ssize_t ret; ipipe = get_pipe_info(in, true); opipe = get_pipe_info(out, true); if (ipipe) { if (off_in) return -ESPIPE; pipe_clear_nowait(in); } if (opipe) { if (off_out) return -ESPIPE; pipe_clear_nowait(out); } if (off_out) { if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; __off_out = &offset; } if (off_in) { if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; __off_in = &offset; } ret = do_splice(in, __off_in, out, __off_out, len, flags); if (ret < 0) return ret; if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t))) return -EFAULT; if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t))) return -EFAULT; return ret; } static ssize_t iter_to_pipe(struct iov_iter *from, struct pipe_inode_info *pipe, unsigned int flags) { struct pipe_buffer buf = { .ops = &user_page_pipe_buf_ops, .flags = flags }; size_t total = 0; ssize_t ret = 0; while (iov_iter_count(from)) { struct page *pages[16]; ssize_t left; size_t start; int i, n; left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start); if (left <= 0) { ret = left; break; } n = DIV_ROUND_UP(left + start, PAGE_SIZE); for (i = 0; i < n; i++) { int size = min_t(int, left, PAGE_SIZE - start); buf.page = pages[i]; buf.offset = start; buf.len = size; ret = add_to_pipe(pipe, &buf); if (unlikely(ret < 0)) { iov_iter_revert(from, left); // this one got dropped by add_to_pipe() while (++i < n) put_page(pages[i]); goto out; } total += ret; left -= size; start = 0; } } out: return total ? total : ret; } static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data); return n == sd->len ? n : -EFAULT; } /* * For lack of a better implementation, implement vmsplice() to userspace * as a simple copy of the pipes pages to the user iov. */ static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter, unsigned int flags) { struct pipe_inode_info *pipe = get_pipe_info(file, true); struct splice_desc sd = { .total_len = iov_iter_count(iter), .flags = flags, .u.data = iter }; ssize_t ret = 0; if (!pipe) return -EBADF; pipe_clear_nowait(file); if (sd.total_len) { pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, pipe_to_user); pipe_unlock(pipe); } if (ret > 0) fsnotify_access(file); return ret; } /* * vmsplice splices a user address range into a pipe. It can be thought of * as splice-from-memory, where the regular splice is splice-from-file (or * to file). In both cases the output is a pipe, naturally. */ static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter, unsigned int flags) { struct pipe_inode_info *pipe; ssize_t ret = 0; unsigned buf_flag = 0; if (flags & SPLICE_F_GIFT) buf_flag = PIPE_BUF_FLAG_GIFT; pipe = get_pipe_info(file, true); if (!pipe) return -EBADF; pipe_clear_nowait(file); pipe_lock(pipe); ret = wait_for_space(pipe, flags); if (!ret) ret = iter_to_pipe(iter, pipe, buf_flag); pipe_unlock(pipe); if (ret > 0) { wakeup_pipe_readers(pipe); fsnotify_modify(file); } return ret; } static int vmsplice_type(struct fd f, int *type) { if (!fd_file(f)) return -EBADF; if (fd_file(f)->f_mode & FMODE_WRITE) { *type = ITER_SOURCE; } else if (fd_file(f)->f_mode & FMODE_READ) { *type = ITER_DEST; } else { fdput(f); return -EBADF; } return 0; } /* * Note that vmsplice only really supports true splicing _from_ user memory * to a pipe, not the other way around. Splicing from user memory is a simple * operation that can be supported without any funky alignment restrictions * or nasty vm tricks. We simply map in the user memory and fill them into * a pipe. The reverse isn't quite as easy, though. There are two possible * solutions for that: * * - memcpy() the data internally, at which point we might as well just * do a regular read() on the buffer anyway. * - Lots of nasty vm tricks, that are neither fast nor flexible (it * has restriction limitations on both ends of the pipe). * * Currently we punt and implement it as a normal copy, see pipe_to_user(). * */ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, unsigned long, nr_segs, unsigned int, flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; ssize_t error; struct fd f; int type; if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; f = fdget(fd); error = vmsplice_type(f, &type); if (error) return error; error = import_iovec(type, uiov, nr_segs, ARRAY_SIZE(iovstack), &iov, &iter); if (error < 0) goto out_fdput; if (!iov_iter_count(&iter)) error = 0; else if (type == ITER_SOURCE) error = vmsplice_to_pipe(fd_file(f), &iter, flags); else error = vmsplice_to_user(fd_file(f), &iter, flags); kfree(iov); out_fdput: fdput(f); return error; } SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, int, fd_out, loff_t __user *, off_out, size_t, len, unsigned int, flags) { struct fd in, out; ssize_t error; if (unlikely(!len)) return 0; if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; error = -EBADF; in = fdget(fd_in); if (fd_file(in)) { out = fdget(fd_out); if (fd_file(out)) { error = __do_splice(fd_file(in), off_in, fd_file(out), off_out, len, flags); fdput(out); } fdput(in); } return error; } /* * Make sure there's data to read. Wait for input if we can, otherwise * return an appropriate error. */ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) { int ret; /* * Check the pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ if (!pipe_empty(pipe->head, pipe->tail)) return 0; ret = 0; pipe_lock(pipe); while (pipe_empty(pipe->head, pipe->tail)) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; } if (!pipe->writers) break; if (flags & SPLICE_F_NONBLOCK) { ret = -EAGAIN; break; } pipe_wait_readable(pipe); } pipe_unlock(pipe); return ret; } /* * Make sure there's writeable room. Wait for room if we can, otherwise * return an appropriate error. */ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) { int ret; /* * Check pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) return 0; ret = 0; pipe_lock(pipe); while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; break; } if (flags & SPLICE_F_NONBLOCK) { ret = -EAGAIN; break; } if (signal_pending(current)) { ret = -ERESTARTSYS; break; } pipe_wait_writable(pipe); } pipe_unlock(pipe); return ret; } /* * Splice contents of ipipe to opipe. */ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; unsigned int i_mask, o_mask; int ret = 0; bool input_wakeup = false; retry: ret = ipipe_prep(ipipe, flags); if (ret) return ret; ret = opipe_prep(opipe, flags); if (ret) return ret; /* * Potential ABBA deadlock, work around it by ordering lock * grabbing by pipe info address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ pipe_double_lock(ipipe, opipe); i_tail = ipipe->tail; i_mask = ipipe->ring_size - 1; o_head = opipe->head; o_mask = opipe->ring_size - 1; do { size_t o_len; if (!opipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } i_head = ipipe->head; o_tail = opipe->tail; if (pipe_empty(i_head, i_tail) && !ipipe->writers) break; /* * Cannot make any progress, because either the input * pipe is empty or the output pipe is full. */ if (pipe_empty(i_head, i_tail) || pipe_full(o_head, o_tail, opipe->max_usage)) { /* Already processed some buffers, break */ if (ret) break; if (flags & SPLICE_F_NONBLOCK) { ret = -EAGAIN; break; } /* * We raced with another reader/writer and haven't * managed to process any buffers. A zero return * value means EOF, so retry instead. */ pipe_unlock(ipipe); pipe_unlock(opipe); goto retry; } ibuf = &ipipe->bufs[i_tail & i_mask]; obuf = &opipe->bufs[o_head & o_mask]; if (len >= ibuf->len) { /* * Simply move the whole buffer from ipipe to opipe */ *obuf = *ibuf; ibuf->ops = NULL; i_tail++; ipipe->tail = i_tail; input_wakeup = true; o_len = obuf->len; o_head++; opipe->head = o_head; } else { /* * Get a reference to this pipe buffer, * so we can copy the contents over. */ if (!pipe_buf_get(ipipe, ibuf)) { if (ret == 0) ret = -EFAULT; break; } *obuf = *ibuf; /* * Don't inherit the gift and merge flags, we need to * prevent multiple steals of this page. */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE; obuf->len = len; ibuf->offset += len; ibuf->len -= len; o_len = len; o_head++; opipe->head = o_head; } ret += o_len; len -= o_len; } while (len); pipe_unlock(ipipe); pipe_unlock(opipe); /* * If we put data in the output pipe, wakeup any potential readers. */ if (ret > 0) wakeup_pipe_readers(opipe); if (input_wakeup) wakeup_pipe_writers(ipipe); return ret; } /* * Link contents of ipipe to opipe. */ static ssize_t link_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; unsigned int i_mask, o_mask; ssize_t ret = 0; /* * Potential ABBA deadlock, work around it by ordering lock * grabbing by pipe info address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ pipe_double_lock(ipipe, opipe); i_tail = ipipe->tail; i_mask = ipipe->ring_size - 1; o_head = opipe->head; o_mask = opipe->ring_size - 1; do { if (!opipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } i_head = ipipe->head; o_tail = opipe->tail; /* * If we have iterated all input buffers or run out of * output room, break. */ if (pipe_empty(i_head, i_tail) || pipe_full(o_head, o_tail, opipe->max_usage)) break; ibuf = &ipipe->bufs[i_tail & i_mask]; obuf = &opipe->bufs[o_head & o_mask]; /* * Get a reference to this pipe buffer, * so we can copy the contents over. */ if (!pipe_buf_get(ipipe, ibuf)) { if (ret == 0) ret = -EFAULT; break; } *obuf = *ibuf; /* * Don't inherit the gift and merge flag, we need to prevent * multiple steals of this page. */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE; if (obuf->len > len) obuf->len = len; ret += obuf->len; len -= obuf->len; o_head++; opipe->head = o_head; i_tail++; } while (len); pipe_unlock(ipipe); pipe_unlock(opipe); /* * If we put data in the output pipe, wakeup any potential readers. */ if (ret > 0) wakeup_pipe_readers(opipe); return ret; } /* * This is a tee(1) implementation that works on pipes. It doesn't copy * any data, it simply references the 'in' pages on the 'out' pipe. * The 'flags' used are the SPLICE_F_* variants, currently the only * applicable one is SPLICE_F_NONBLOCK. */ ssize_t do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe = get_pipe_info(in, true); struct pipe_inode_info *opipe = get_pipe_info(out, true); ssize_t ret = -EINVAL; if (unlikely(!(in->f_mode & FMODE_READ) || !(out->f_mode & FMODE_WRITE))) return -EBADF; /* * Duplicate the contents of ipipe to opipe without actually * copying the data. */ if (ipipe && opipe && ipipe != opipe) { if ((in->f_flags | out->f_flags) & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; /* * Keep going, unless we encounter an error. The ipipe/opipe * ordering doesn't really matter. */ ret = ipipe_prep(ipipe, flags); if (!ret) { ret = opipe_prep(opipe, flags); if (!ret) ret = link_pipe(ipipe, opipe, len, flags); } } if (ret > 0) { fsnotify_access(in); fsnotify_modify(out); } return ret; } SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) { struct fd in, out; ssize_t error; if (unlikely(flags & ~SPLICE_F_ALL)) return -EINVAL; if (unlikely(!len)) return 0; error = -EBADF; in = fdget(fdin); if (fd_file(in)) { out = fdget(fdout); if (fd_file(out)) { error = do_tee(fd_file(in), fd_file(out), len, flags); fdput(out); } fdput(in); } return error; } |
1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 | /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ #ifndef RXE_QUEUE_H #define RXE_QUEUE_H /* Implements a simple circular buffer that is shared between user * and the driver and can be resized. The requested element size is * rounded up to a power of 2 and the number of elements in the buffer * is also rounded up to a power of 2. Since the queue is empty when * the producer and consumer indices match the maximum capacity of the * queue is one less than the number of element slots. * * Notes: * - The driver indices are always masked off to q->index_mask * before storing so do not need to be checked on reads. * - The user whether user space or kernel is generally * not trusted so its parameters are masked to make sure * they do not access the queue out of bounds on reads. * - The driver indices for queues must not be written * by user so a local copy is used and a shared copy is * stored when the local copy is changed. * - By passing the type in the parameter list separate from q * the compiler can eliminate the switch statement when the * actual queue type is known when the function is called at * compile time. * - These queues are lock free. The user and driver must protect * changes to their end of the queues with locks if more than one * CPU can be accessing it at the same time. */ /** * enum queue_type - type of queue * @QUEUE_TYPE_TO_CLIENT: Queue is written by rxe driver and * read by client which may be a user space * application or a kernel ulp. * Used by rxe internals only. * @QUEUE_TYPE_FROM_CLIENT: Queue is written by client and * read by rxe driver. * Used by rxe internals only. * @QUEUE_TYPE_FROM_ULP: Queue is written by kernel ulp and * read by rxe driver. * Used by kernel verbs APIs only on * behalf of ulps. * @QUEUE_TYPE_TO_ULP: Queue is written by rxe driver and * read by kernel ulp. * Used by kernel verbs APIs only on * behalf of ulps. */ enum queue_type { QUEUE_TYPE_TO_CLIENT, QUEUE_TYPE_FROM_CLIENT, QUEUE_TYPE_FROM_ULP, QUEUE_TYPE_TO_ULP, }; struct rxe_queue_buf; struct rxe_queue { struct rxe_dev *rxe; struct rxe_queue_buf *buf; struct rxe_mmap_info *ip; size_t buf_size; size_t elem_size; unsigned int log2_elem_size; u32 index_mask; enum queue_type type; /* private copy of index for shared queues between * driver and clients. Driver reads and writes * this copy and then replicates to rxe_queue_buf * for read access by clients. */ u32 index; }; int do_mmap_info(struct rxe_dev *rxe, struct mminfo __user *outbuf, struct ib_udata *udata, struct rxe_queue_buf *buf, size_t buf_size, struct rxe_mmap_info **ip_p); void rxe_queue_reset(struct rxe_queue *q); struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe, int *num_elem, unsigned int elem_size, enum queue_type type); int rxe_queue_resize(struct rxe_queue *q, unsigned int *num_elem_p, unsigned int elem_size, struct ib_udata *udata, struct mminfo __user *outbuf, spinlock_t *producer_lock, spinlock_t *consumer_lock); void rxe_queue_cleanup(struct rxe_queue *queue); static inline u32 queue_next_index(struct rxe_queue *q, int index) { return (index + 1) & q->index_mask; } static inline u32 queue_get_producer(const struct rxe_queue *q, enum queue_type type) { u32 prod; switch (type) { case QUEUE_TYPE_FROM_CLIENT: /* used by rxe, client owns the index */ prod = smp_load_acquire(&q->buf->producer_index); break; case QUEUE_TYPE_TO_CLIENT: /* used by rxe which owns the index */ prod = q->index; break; case QUEUE_TYPE_FROM_ULP: /* used by ulp which owns the index */ prod = q->buf->producer_index; break; case QUEUE_TYPE_TO_ULP: /* used by ulp, rxe owns the index */ prod = smp_load_acquire(&q->buf->producer_index); break; } return prod; } static inline u32 queue_get_consumer(const struct rxe_queue *q, enum queue_type type) { u32 cons; switch (type) { case QUEUE_TYPE_FROM_CLIENT: /* used by rxe which owns the index */ cons = q->index; break; case QUEUE_TYPE_TO_CLIENT: /* used by rxe, client owns the index */ cons = smp_load_acquire(&q->buf->consumer_index); break; case QUEUE_TYPE_FROM_ULP: /* used by ulp, rxe owns the index */ cons = smp_load_acquire(&q->buf->consumer_index); break; case QUEUE_TYPE_TO_ULP: /* used by ulp which owns the index */ cons = q->buf->consumer_index; break; } return cons; } static inline int queue_empty(struct rxe_queue *q, enum queue_type type) { u32 prod = queue_get_producer(q, type); u32 cons = queue_get_consumer(q, type); return ((prod - cons) & q->index_mask) == 0; } static inline int queue_full(struct rxe_queue *q, enum queue_type type) { u32 prod = queue_get_producer(q, type); u32 cons = queue_get_consumer(q, type); return ((prod + 1 - cons) & q->index_mask) == 0; } static inline u32 queue_count(const struct rxe_queue *q, enum queue_type type) { u32 prod = queue_get_producer(q, type); u32 cons = queue_get_consumer(q, type); return (prod - cons) & q->index_mask; } static inline void queue_advance_producer(struct rxe_queue *q, enum queue_type type) { u32 prod; switch (type) { case QUEUE_TYPE_FROM_CLIENT: /* used by rxe, client owns the index */ if (WARN_ON(1)) pr_warn("%s: attempt to advance client index\n", __func__); break; case QUEUE_TYPE_TO_CLIENT: /* used by rxe which owns the index */ prod = q->index; prod = (prod + 1) & q->index_mask; q->index = prod; /* release so client can read it safely */ smp_store_release(&q->buf->producer_index, prod); break; case QUEUE_TYPE_FROM_ULP: /* used by ulp which owns the index */ prod = q->buf->producer_index; prod = (prod + 1) & q->index_mask; /* release so rxe can read it safely */ smp_store_release(&q->buf->producer_index, prod); break; case QUEUE_TYPE_TO_ULP: /* used by ulp, rxe owns the index */ if (WARN_ON(1)) pr_warn("%s: attempt to advance driver index\n", __func__); break; } } static inline void queue_advance_consumer(struct rxe_queue *q, enum queue_type type) { u32 cons; switch (type) { case QUEUE_TYPE_FROM_CLIENT: /* used by rxe which owns the index */ cons = (q->index + 1) & q->index_mask; q->index = cons; /* release so client can read it safely */ smp_store_release(&q->buf->consumer_index, cons); break; case QUEUE_TYPE_TO_CLIENT: /* used by rxe, client owns the index */ if (WARN_ON(1)) pr_warn("%s: attempt to advance client index\n", __func__); break; case QUEUE_TYPE_FROM_ULP: /* used by ulp, rxe owns the index */ if (WARN_ON(1)) pr_warn("%s: attempt to advance driver index\n", __func__); break; case QUEUE_TYPE_TO_ULP: /* used by ulp which owns the index */ cons = q->buf->consumer_index; cons = (cons + 1) & q->index_mask; /* release so rxe can read it safely */ smp_store_release(&q->buf->consumer_index, cons); break; } } static inline void *queue_producer_addr(struct rxe_queue *q, enum queue_type type) { u32 prod = queue_get_producer(q, type); return q->buf->data + (prod << q->log2_elem_size); } static inline void *queue_consumer_addr(struct rxe_queue *q, enum queue_type type) { u32 cons = queue_get_consumer(q, type); return q->buf->data + (cons << q->log2_elem_size); } static inline void *queue_addr_from_index(struct rxe_queue *q, u32 index) { return q->buf->data + ((index & q->index_mask) << q->log2_elem_size); } static inline u32 queue_index_from_addr(const struct rxe_queue *q, const void *addr) { return (((u8 *)addr - q->buf->data) >> q->log2_elem_size) & q->index_mask; } static inline void *queue_head(struct rxe_queue *q, enum queue_type type) { return queue_empty(q, type) ? NULL : queue_consumer_addr(q, type); } #endif /* RXE_QUEUE_H */ |
20 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 | /* * Copyright IBM Corporation, 2012 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2.1 of the GNU Lesser General Public License * as published by the Free Software Foundation. * * This program is distributed in the hope that it would be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * */ #ifndef _LINUX_HUGETLB_CGROUP_H #define _LINUX_HUGETLB_CGROUP_H #include <linux/mmdebug.h> struct hugetlb_cgroup; struct resv_map; struct file_region; #ifdef CONFIG_CGROUP_HUGETLB enum hugetlb_memory_event { HUGETLB_MAX, HUGETLB_NR_MEMORY_EVENTS, }; struct hugetlb_cgroup_per_node { /* hugetlb usage in pages over all hstates. */ unsigned long usage[HUGE_MAX_HSTATE]; }; struct hugetlb_cgroup { struct cgroup_subsys_state css; /* * the counter to account for hugepages from hugetlb. */ struct page_counter hugepage[HUGE_MAX_HSTATE]; /* * the counter to account for hugepage reservations from hugetlb. */ struct page_counter rsvd_hugepage[HUGE_MAX_HSTATE]; atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS]; atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS]; /* Handle for "hugetlb.events" */ struct cgroup_file events_file[HUGE_MAX_HSTATE]; /* Handle for "hugetlb.events.local" */ struct cgroup_file events_local_file[HUGE_MAX_HSTATE]; struct hugetlb_cgroup_per_node *nodeinfo[]; }; static inline struct hugetlb_cgroup * __hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); if (rsvd) return folio->_hugetlb_cgroup_rsvd; else return folio->_hugetlb_cgroup; } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio) { return __hugetlb_cgroup_from_folio(folio, false); } static inline struct hugetlb_cgroup * hugetlb_cgroup_from_folio_rsvd(struct folio *folio) { return __hugetlb_cgroup_from_folio(folio, true); } static inline void __set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg, bool rsvd) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); if (rsvd) folio->_hugetlb_cgroup_rsvd = h_cg; else folio->_hugetlb_cgroup = h_cg; } static inline void set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg) { __set_hugetlb_cgroup(folio, h_cg, false); } static inline void set_hugetlb_cgroup_rsvd(struct folio *folio, struct hugetlb_cgroup *h_cg) { __set_hugetlb_cgroup(folio, h_cg, true); } static inline bool hugetlb_cgroup_disabled(void) { return !cgroup_subsys_enabled(hugetlb_cgrp_subsys); } static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg) { css_put(&h_cg->css); } static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( struct resv_map *resv_map) { if (resv_map->css) css_get(resv_map->css); } static inline void resv_map_put_hugetlb_cgroup_uncharge_info( struct resv_map *resv_map) { if (resv_map->css) css_put(resv_map->css); } extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr); extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio); extern void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio); extern void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio); extern void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, struct folio *folio); extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg); extern void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg); extern void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, unsigned long end); extern void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, struct file_region *rg, unsigned long nr_pages, bool region_del); extern void hugetlb_cgroup_file_init(void) __init; extern void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio); #else static inline void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, struct file_region *rg, unsigned long nr_pages, bool region_del) { } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio) { return NULL; } static inline struct hugetlb_cgroup * hugetlb_cgroup_from_folio_rsvd(struct folio *folio) { return NULL; } static inline void set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg) { } static inline void set_hugetlb_cgroup_rsvd(struct folio *folio, struct hugetlb_cgroup *h_cg) { } static inline bool hugetlb_cgroup_disabled(void) { return true; } static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg) { } static inline void resv_map_dup_hugetlb_cgroup_uncharge_info( struct resv_map *resv_map) { } static inline void resv_map_put_hugetlb_cgroup_uncharge_info( struct resv_map *resv_map) { } static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { return 0; } static inline int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { return 0; } static inline void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio) { } static inline void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio) { } static inline void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio) { } static inline void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, struct folio *folio) { } static inline void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { } static inline void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { } static inline void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, unsigned long end) { } static inline void hugetlb_cgroup_file_init(void) { } static inline void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) { } #endif /* CONFIG_MEM_RES_CTLR_HUGETLB */ #endif |
12 12 12 12 12 12 12 12 12 12 12 12 1 12 12 12 12 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 | /* * POSIX message queues filesystem for Linux. * * Copyright (C) 2003,2004 Krzysztof Benedyczak (golbi@mat.uni.torun.pl) * Michal Wronski (michal.wronski@gmail.com) * * Spinlocks: Mohamed Abbas (abbas.mohamed@intel.com) * Lockless receive & send, fd based notify: * Manfred Spraul (manfred@colorfullife.com) * * Audit: George Wilson (ltcgcw@us.ibm.com) * * This file is released under the GPL. */ #include <linux/capability.h> #include <linux/init.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/mount.h> #include <linux/fs_context.h> #include <linux/namei.h> #include <linux/sysctl.h> #include <linux/poll.h> #include <linux/mqueue.h> #include <linux/msg.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/netlink.h> #include <linux/syscalls.h> #include <linux/audit.h> #include <linux/signal.h> #include <linux/mutex.h> #include <linux/nsproxy.h> #include <linux/pid.h> #include <linux/ipc_namespace.h> #include <linux/user_namespace.h> #include <linux/slab.h> #include <linux/sched/wake_q.h> #include <linux/sched/signal.h> #include <linux/sched/user.h> #include <net/sock.h> #include "util.h" struct mqueue_fs_context { struct ipc_namespace *ipc_ns; bool newns; /* Set if newly created ipc namespace */ }; #define MQUEUE_MAGIC 0x19800202 #define DIRENT_SIZE 20 #define FILENT_SIZE 80 #define SEND 0 #define RECV 1 #define STATE_NONE 0 #define STATE_READY 1 struct posix_msg_tree_node { struct rb_node rb_node; struct list_head msg_list; int priority; }; /* * Locking: * * Accesses to a message queue are synchronized by acquiring info->lock. * * There are two notable exceptions: * - The actual wakeup of a sleeping task is performed using the wake_q * framework. info->lock is already released when wake_up_q is called. * - The exit codepaths after sleeping check ext_wait_queue->state without * any locks. If it is STATE_READY, then the syscall is completed without * acquiring info->lock. * * MQ_BARRIER: * To achieve proper release/acquire memory barrier pairing, the state is set to * STATE_READY with smp_store_release(), and it is read with READ_ONCE followed * by smp_acquire__after_ctrl_dep(). In addition, wake_q_add_safe() is used. * * This prevents the following races: * * 1) With the simple wake_q_add(), the task could be gone already before * the increase of the reference happens * Thread A * Thread B * WRITE_ONCE(wait.state, STATE_NONE); * schedule_hrtimeout() * wake_q_add(A) * if (cmpxchg()) // success * ->state = STATE_READY (reordered) * <timeout returns> * if (wait.state == STATE_READY) return; * sysret to user space * sys_exit() * get_task_struct() // UaF * * Solution: Use wake_q_add_safe() and perform the get_task_struct() before * the smp_store_release() that does ->state = STATE_READY. * * 2) Without proper _release/_acquire barriers, the woken up task * could read stale data * * Thread A * Thread B * do_mq_timedreceive * WRITE_ONCE(wait.state, STATE_NONE); * schedule_hrtimeout() * state = STATE_READY; * <timeout returns> * if (wait.state == STATE_READY) return; * msg_ptr = wait.msg; // Access to stale data! * receiver->msg = message; (reordered) * * Solution: use _release and _acquire barriers. * * 3) There is intentionally no barrier when setting current->state * to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the * release memory barrier, and the wakeup is triggered when holding * info->lock, i.e. spin_lock(&info->lock) provided a pairing * acquire memory barrier. */ struct ext_wait_queue { /* queue of sleeping tasks */ struct task_struct *task; struct list_head list; struct msg_msg *msg; /* ptr of loaded message */ int state; /* one of STATE_* values */ }; struct mqueue_inode_info { spinlock_t lock; struct inode vfs_inode; wait_queue_head_t wait_q; struct rb_root msg_tree; struct rb_node *msg_tree_rightmost; struct posix_msg_tree_node *node_cache; struct mq_attr attr; struct sigevent notify; struct pid *notify_owner; u32 notify_self_exec_id; struct user_namespace *notify_user_ns; struct ucounts *ucounts; /* user who created, for accounting */ struct sock *notify_sock; struct sk_buff *notify_cookie; /* for tasks waiting for free space and messages, respectively */ struct ext_wait_queue e_wait_q[2]; unsigned long qsize; /* size of queue in memory (sum of all msgs) */ }; static struct file_system_type mqueue_fs_type; static const struct inode_operations mqueue_dir_inode_operations; static const struct file_operations mqueue_file_operations; static const struct super_operations mqueue_super_ops; static const struct fs_context_operations mqueue_fs_context_ops; static void remove_notification(struct mqueue_inode_info *info); static struct kmem_cache *mqueue_inode_cachep; static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode) { return container_of(inode, struct mqueue_inode_info, vfs_inode); } /* * This routine should be called with the mq_lock held. */ static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode) { return get_ipc_ns(inode->i_sb->s_fs_info); } static struct ipc_namespace *get_ns_from_inode(struct inode *inode) { struct ipc_namespace *ns; spin_lock(&mq_lock); ns = __get_ns_from_inode(inode); spin_unlock(&mq_lock); return ns; } /* Auxiliary functions to manipulate messages' list */ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info) { struct rb_node **p, *parent = NULL; struct posix_msg_tree_node *leaf; bool rightmost = true; p = &info->msg_tree.rb_node; while (*p) { parent = *p; leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node); if (likely(leaf->priority == msg->m_type)) goto insert_msg; else if (msg->m_type < leaf->priority) { p = &(*p)->rb_left; rightmost = false; } else p = &(*p)->rb_right; } if (info->node_cache) { leaf = info->node_cache; info->node_cache = NULL; } else { leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC); if (!leaf) return -ENOMEM; INIT_LIST_HEAD(&leaf->msg_list); } leaf->priority = msg->m_type; if (rightmost) info->msg_tree_rightmost = &leaf->rb_node; rb_link_node(&leaf->rb_node, parent, p); rb_insert_color(&leaf->rb_node, &info->msg_tree); insert_msg: info->attr.mq_curmsgs++; info->qsize += msg->m_ts; list_add_tail(&msg->m_list, &leaf->msg_list); return 0; } static inline void msg_tree_erase(struct posix_msg_tree_node *leaf, struct mqueue_inode_info *info) { struct rb_node *node = &leaf->rb_node; if (info->msg_tree_rightmost == node) info->msg_tree_rightmost = rb_prev(node); rb_erase(node, &info->msg_tree); if (info->node_cache) kfree(leaf); else info->node_cache = leaf; } static inline struct msg_msg *msg_get(struct mqueue_inode_info *info) { struct rb_node *parent = NULL; struct posix_msg_tree_node *leaf; struct msg_msg *msg; try_again: /* * During insert, low priorities go to the left and high to the * right. On receive, we want the highest priorities first, so * walk all the way to the right. */ parent = info->msg_tree_rightmost; if (!parent) { if (info->attr.mq_curmsgs) { pr_warn_once("Inconsistency in POSIX message queue, " "no tree element, but supposedly messages " "should exist!\n"); info->attr.mq_curmsgs = 0; } return NULL; } leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node); if (unlikely(list_empty(&leaf->msg_list))) { pr_warn_once("Inconsistency in POSIX message queue, " "empty leaf node but we haven't implemented " "lazy leaf delete!\n"); msg_tree_erase(leaf, info); goto try_again; } else { msg = list_first_entry(&leaf->msg_list, struct msg_msg, m_list); list_del(&msg->m_list); if (list_empty(&leaf->msg_list)) { msg_tree_erase(leaf, info); } } info->attr.mq_curmsgs--; info->qsize -= msg->m_ts; return msg; } static struct inode *mqueue_get_inode(struct super_block *sb, struct ipc_namespace *ipc_ns, umode_t mode, struct mq_attr *attr) { struct inode *inode; int ret = -ENOMEM; inode = new_inode(sb); if (!inode) goto err; inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); simple_inode_init_ts(inode); if (S_ISREG(mode)) { struct mqueue_inode_info *info; unsigned long mq_bytes, mq_treesize; inode->i_fop = &mqueue_file_operations; inode->i_size = FILENT_SIZE; /* mqueue specific info */ info = MQUEUE_I(inode); spin_lock_init(&info->lock); init_waitqueue_head(&info->wait_q); INIT_LIST_HEAD(&info->e_wait_q[0].list); INIT_LIST_HEAD(&info->e_wait_q[1].list); info->notify_owner = NULL; info->notify_user_ns = NULL; info->qsize = 0; info->ucounts = NULL; /* set when all is ok */ info->msg_tree = RB_ROOT; info->msg_tree_rightmost = NULL; info->node_cache = NULL; memset(&info->attr, 0, sizeof(info->attr)); info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max, ipc_ns->mq_msg_default); info->attr.mq_msgsize = min(ipc_ns->mq_msgsize_max, ipc_ns->mq_msgsize_default); if (attr) { info->attr.mq_maxmsg = attr->mq_maxmsg; info->attr.mq_msgsize = attr->mq_msgsize; } /* * We used to allocate a static array of pointers and account * the size of that array as well as one msg_msg struct per * possible message into the queue size. That's no longer * accurate as the queue is now an rbtree and will grow and * shrink depending on usage patterns. We can, however, still * account one msg_msg struct per message, but the nodes are * allocated depending on priority usage, and most programs * only use one, or a handful, of priorities. However, since * this is pinned memory, we need to assume worst case, so * that means the min(mq_maxmsg, max_priorities) * struct * posix_msg_tree_node. */ ret = -EINVAL; if (info->attr.mq_maxmsg <= 0 || info->attr.mq_msgsize <= 0) goto out_inode; if (capable(CAP_SYS_RESOURCE)) { if (info->attr.mq_maxmsg > HARD_MSGMAX || info->attr.mq_msgsize > HARD_MSGSIZEMAX) goto out_inode; } else { if (info->attr.mq_maxmsg > ipc_ns->mq_msg_max || info->attr.mq_msgsize > ipc_ns->mq_msgsize_max) goto out_inode; } ret = -EOVERFLOW; /* check for overflow */ if (info->attr.mq_msgsize > ULONG_MAX/info->attr.mq_maxmsg) goto out_inode; mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) + min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) * sizeof(struct posix_msg_tree_node); mq_bytes = info->attr.mq_maxmsg * info->attr.mq_msgsize; if (mq_bytes + mq_treesize < mq_bytes) goto out_inode; mq_bytes += mq_treesize; info->ucounts = get_ucounts(current_ucounts()); if (info->ucounts) { long msgqueue; spin_lock(&mq_lock); msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) { dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); spin_unlock(&mq_lock); put_ucounts(info->ucounts); info->ucounts = NULL; /* mqueue_evict_inode() releases info->messages */ ret = -EMFILE; goto out_inode; } spin_unlock(&mq_lock); } } else if (S_ISDIR(mode)) { inc_nlink(inode); /* Some things misbehave if size == 0 on a directory */ inode->i_size = 2 * DIRENT_SIZE; inode->i_op = &mqueue_dir_inode_operations; inode->i_fop = &simple_dir_operations; } return inode; out_inode: iput(inode); err: return ERR_PTR(ret); } static int mqueue_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; struct ipc_namespace *ns = sb->s_fs_info; sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = MQUEUE_MAGIC; sb->s_op = &mqueue_super_ops; inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); sb->s_root = d_make_root(inode); if (!sb->s_root) return -ENOMEM; return 0; } static int mqueue_get_tree(struct fs_context *fc) { struct mqueue_fs_context *ctx = fc->fs_private; /* * With a newly created ipc namespace, we don't need to do a search * for an ipc namespace match, but we still need to set s_fs_info. */ if (ctx->newns) { fc->s_fs_info = ctx->ipc_ns; return get_tree_nodev(fc, mqueue_fill_super); } return get_tree_keyed(fc, mqueue_fill_super, ctx->ipc_ns); } static void mqueue_fs_context_free(struct fs_context *fc) { struct mqueue_fs_context *ctx = fc->fs_private; put_ipc_ns(ctx->ipc_ns); kfree(ctx); } static int mqueue_init_fs_context(struct fs_context *fc) { struct mqueue_fs_context *ctx; ctx = kzalloc(sizeof(struct mqueue_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns); put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns); fc->fs_private = ctx; fc->ops = &mqueue_fs_context_ops; return 0; } /* * mq_init_ns() is currently the only caller of mq_create_mount(). * So the ns parameter is always a newly created ipc namespace. */ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns) { struct mqueue_fs_context *ctx; struct fs_context *fc; struct vfsmount *mnt; fc = fs_context_for_mount(&mqueue_fs_type, SB_KERNMOUNT); if (IS_ERR(fc)) return ERR_CAST(fc); ctx = fc->fs_private; ctx->newns = true; put_ipc_ns(ctx->ipc_ns); ctx->ipc_ns = get_ipc_ns(ns); put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns); mnt = fc_mount(fc); put_fs_context(fc); return mnt; } static void init_once(void *foo) { struct mqueue_inode_info *p = foo; inode_init_once(&p->vfs_inode); } static struct inode *mqueue_alloc_inode(struct super_block *sb) { struct mqueue_inode_info *ei; ei = alloc_inode_sb(sb, mqueue_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; } static void mqueue_free_inode(struct inode *inode) { kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode)); } static void mqueue_evict_inode(struct inode *inode) { struct mqueue_inode_info *info; struct ipc_namespace *ipc_ns; struct msg_msg *msg, *nmsg; LIST_HEAD(tmp_msg); clear_inode(inode); if (S_ISDIR(inode->i_mode)) return; ipc_ns = get_ns_from_inode(inode); info = MQUEUE_I(inode); spin_lock(&info->lock); while ((msg = msg_get(info)) != NULL) list_add_tail(&msg->m_list, &tmp_msg); kfree(info->node_cache); spin_unlock(&info->lock); list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) { list_del(&msg->m_list); free_msg(msg); } if (info->ucounts) { unsigned long mq_bytes, mq_treesize; /* Total amount of bytes accounted for the mqueue */ mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) + min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) * sizeof(struct posix_msg_tree_node); mq_bytes = mq_treesize + (info->attr.mq_maxmsg * info->attr.mq_msgsize); spin_lock(&mq_lock); dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); /* * get_ns_from_inode() ensures that the * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns * to which we now hold a reference, or it is NULL. * We can't put it here under mq_lock, though. */ if (ipc_ns) ipc_ns->mq_queues_count--; spin_unlock(&mq_lock); put_ucounts(info->ucounts); info->ucounts = NULL; } if (ipc_ns) put_ipc_ns(ipc_ns); } static int mqueue_create_attr(struct dentry *dentry, umode_t mode, void *arg) { struct inode *dir = dentry->d_parent->d_inode; struct inode *inode; struct mq_attr *attr = arg; int error; struct ipc_namespace *ipc_ns; spin_lock(&mq_lock); ipc_ns = __get_ns_from_inode(dir); if (!ipc_ns) { error = -EACCES; goto out_unlock; } if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max && !capable(CAP_SYS_RESOURCE)) { error = -ENOSPC; goto out_unlock; } ipc_ns->mq_queues_count++; spin_unlock(&mq_lock); inode = mqueue_get_inode(dir->i_sb, ipc_ns, mode, attr); if (IS_ERR(inode)) { error = PTR_ERR(inode); spin_lock(&mq_lock); ipc_ns->mq_queues_count--; goto out_unlock; } put_ipc_ns(ipc_ns); dir->i_size += DIRENT_SIZE; simple_inode_init_ts(dir); d_instantiate(dentry, inode); dget(dentry); return 0; out_unlock: spin_unlock(&mq_lock); if (ipc_ns) put_ipc_ns(ipc_ns); return error; } static int mqueue_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return mqueue_create_attr(dentry, mode, NULL); } static int mqueue_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); simple_inode_init_ts(dir); dir->i_size -= DIRENT_SIZE; drop_nlink(inode); dput(dentry); return 0; } /* * This is routine for system read from queue file. * To avoid mess with doing here some sort of mq_receive we allow * to read only queue size & notification info (the only values * that are interesting from user point of view and aren't accessible * through std routines) */ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data, size_t count, loff_t *off) { struct inode *inode = file_inode(filp); struct mqueue_inode_info *info = MQUEUE_I(inode); char buffer[FILENT_SIZE]; ssize_t ret; spin_lock(&info->lock); snprintf(buffer, sizeof(buffer), "QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n", info->qsize, info->notify_owner ? info->notify.sigev_notify : 0, (info->notify_owner && info->notify.sigev_notify == SIGEV_SIGNAL) ? info->notify.sigev_signo : 0, pid_vnr(info->notify_owner)); spin_unlock(&info->lock); buffer[sizeof(buffer)-1] = '\0'; ret = simple_read_from_buffer(u_data, count, off, buffer, strlen(buffer)); if (ret <= 0) return ret; inode_set_atime_to_ts(inode, inode_set_ctime_current(inode)); return ret; } static int mqueue_flush_file(struct file *filp, fl_owner_t id) { struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp)); spin_lock(&info->lock); if (task_tgid(current) == info->notify_owner) remove_notification(info); spin_unlock(&info->lock); return 0; } static __poll_t mqueue_poll_file(struct file *filp, struct poll_table_struct *poll_tab) { struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp)); __poll_t retval = 0; poll_wait(filp, &info->wait_q, poll_tab); spin_lock(&info->lock); if (info->attr.mq_curmsgs) retval = EPOLLIN | EPOLLRDNORM; if (info->attr.mq_curmsgs < info->attr.mq_maxmsg) retval |= EPOLLOUT | EPOLLWRNORM; spin_unlock(&info->lock); return retval; } /* Adds current to info->e_wait_q[sr] before element with smaller prio */ static void wq_add(struct mqueue_inode_info *info, int sr, struct ext_wait_queue *ewp) { struct ext_wait_queue *walk; list_for_each_entry(walk, &info->e_wait_q[sr].list, list) { if (walk->task->prio <= current->prio) { list_add_tail(&ewp->list, &walk->list); return; } } list_add_tail(&ewp->list, &info->e_wait_q[sr].list); } /* * Puts current task to sleep. Caller must hold queue lock. After return * lock isn't held. * sr: SEND or RECV */ static int wq_sleep(struct mqueue_inode_info *info, int sr, ktime_t *timeout, struct ext_wait_queue *ewp) __releases(&info->lock) { int retval; signed long time; wq_add(info, sr, ewp); for (;;) { /* memory barrier not required, we hold info->lock */ __set_current_state(TASK_INTERRUPTIBLE); spin_unlock(&info->lock); time = schedule_hrtimeout_range_clock(timeout, 0, HRTIMER_MODE_ABS, CLOCK_REALTIME); if (READ_ONCE(ewp->state) == STATE_READY) { /* see MQ_BARRIER for purpose/pairing */ smp_acquire__after_ctrl_dep(); retval = 0; goto out; } spin_lock(&info->lock); /* we hold info->lock, so no memory barrier required */ if (READ_ONCE(ewp->state) == STATE_READY) { retval = 0; goto out_unlock; } if (signal_pending(current)) { retval = -ERESTARTSYS; break; } if (time == 0) { retval = -ETIMEDOUT; break; } } list_del(&ewp->list); out_unlock: spin_unlock(&info->lock); out: return retval; } /* * Returns waiting task that should be serviced first or NULL if none exists */ static struct ext_wait_queue *wq_get_first_waiter( struct mqueue_inode_info *info, int sr) { struct list_head *ptr; ptr = info->e_wait_q[sr].list.prev; if (ptr == &info->e_wait_q[sr].list) return NULL; return list_entry(ptr, struct ext_wait_queue, list); } static inline void set_cookie(struct sk_buff *skb, char code) { ((char *)skb->data)[NOTIFY_COOKIE_LEN-1] = code; } /* * The next function is only to split too long sys_mq_timedsend */ static void __do_notify(struct mqueue_inode_info *info) { /* notification * invoked when there is registered process and there isn't process * waiting synchronously for message AND state of queue changed from * empty to not empty. Here we are sure that no one is waiting * synchronously. */ if (info->notify_owner && info->attr.mq_curmsgs == 1) { switch (info->notify.sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: { struct kernel_siginfo sig_i; struct task_struct *task; /* do_mq_notify() accepts sigev_signo == 0, why?? */ if (!info->notify.sigev_signo) break; clear_siginfo(&sig_i); sig_i.si_signo = info->notify.sigev_signo; sig_i.si_errno = 0; sig_i.si_code = SI_MESGQ; sig_i.si_value = info->notify.sigev_value; rcu_read_lock(); /* map current pid/uid into info->owner's namespaces */ sig_i.si_pid = task_tgid_nr_ns(current, ns_of_pid(info->notify_owner)); sig_i.si_uid = from_kuid_munged(info->notify_user_ns, current_uid()); /* * We can't use kill_pid_info(), this signal should * bypass check_kill_permission(). It is from kernel * but si_fromuser() can't know this. * We do check the self_exec_id, to avoid sending * signals to programs that don't expect them. */ task = pid_task(info->notify_owner, PIDTYPE_TGID); if (task && task->self_exec_id == info->notify_self_exec_id) { do_send_sig_info(info->notify.sigev_signo, &sig_i, task, PIDTYPE_TGID); } rcu_read_unlock(); break; } case SIGEV_THREAD: set_cookie(info->notify_cookie, NOTIFY_WOKENUP); netlink_sendskb(info->notify_sock, info->notify_cookie); break; } /* after notification unregisters process */ put_pid(info->notify_owner); put_user_ns(info->notify_user_ns); info->notify_owner = NULL; info->notify_user_ns = NULL; } wake_up(&info->wait_q); } static int prepare_timeout(const struct __kernel_timespec __user *u_abs_timeout, struct timespec64 *ts) { if (get_timespec64(ts, u_abs_timeout)) return -EFAULT; if (!timespec64_valid(ts)) return -EINVAL; return 0; } static void remove_notification(struct mqueue_inode_info *info) { if (info->notify_owner != NULL && info->notify.sigev_notify == SIGEV_THREAD) { set_cookie(info->notify_cookie, NOTIFY_REMOVED); netlink_sendskb(info->notify_sock, info->notify_cookie); } put_pid(info->notify_owner); put_user_ns(info->notify_user_ns); info->notify_owner = NULL; info->notify_user_ns = NULL; } static int prepare_open(struct dentry *dentry, int oflag, int ro, umode_t mode, struct filename *name, struct mq_attr *attr) { static const int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE, MAY_READ | MAY_WRITE }; int acc; if (d_really_is_negative(dentry)) { if (!(oflag & O_CREAT)) return -ENOENT; if (ro) return ro; audit_inode_parent_hidden(name, dentry->d_parent); return vfs_mkobj(dentry, mode & ~current_umask(), mqueue_create_attr, attr); } /* it already existed */ audit_inode(name, dentry, 0); if ((oflag & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) return -EEXIST; if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY)) return -EINVAL; acc = oflag2acc[oflag & O_ACCMODE]; return inode_permission(&nop_mnt_idmap, d_inode(dentry), acc); } static int do_mq_open(const char __user *u_name, int oflag, umode_t mode, struct mq_attr *attr) { struct vfsmount *mnt = current->nsproxy->ipc_ns->mq_mnt; struct dentry *root = mnt->mnt_root; struct filename *name; struct path path; int fd, error; int ro; audit_mq_open(oflag, mode, attr); name = getname(u_name); if (IS_ERR(name)) return PTR_ERR(name); fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) goto out_putname; ro = mnt_want_write(mnt); /* we'll drop it in any case */ inode_lock(d_inode(root)); path.dentry = lookup_one_len(name->name, root, strlen(name->name)); if (IS_ERR(path.dentry)) { error = PTR_ERR(path.dentry); goto out_putfd; } path.mnt = mntget(mnt); error = prepare_open(path.dentry, oflag, ro, mode, name, attr); if (!error) { struct file *file = dentry_open(&path, oflag, current_cred()); if (!IS_ERR(file)) fd_install(fd, file); else error = PTR_ERR(file); } path_put(&path); out_putfd: if (error) { put_unused_fd(fd); fd = error; } inode_unlock(d_inode(root)); if (!ro) mnt_drop_write(mnt); out_putname: putname(name); return fd; } SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode, struct mq_attr __user *, u_attr) { struct mq_attr attr; if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr))) return -EFAULT; return do_mq_open(u_name, oflag, mode, u_attr ? &attr : NULL); } SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) { int err; struct filename *name; struct dentry *dentry; struct inode *inode = NULL; struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; struct vfsmount *mnt = ipc_ns->mq_mnt; name = getname(u_name); if (IS_ERR(name)) return PTR_ERR(name); audit_inode_parent_hidden(name, mnt->mnt_root); err = mnt_want_write(mnt); if (err) goto out_name; inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT); dentry = lookup_one_len(name->name, mnt->mnt_root, strlen(name->name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_unlock; } inode = d_inode(dentry); if (!inode) { err = -ENOENT; } else { ihold(inode); err = vfs_unlink(&nop_mnt_idmap, d_inode(dentry->d_parent), dentry, NULL); } dput(dentry); out_unlock: inode_unlock(d_inode(mnt->mnt_root)); iput(inode); mnt_drop_write(mnt); out_name: putname(name); return err; } /* Pipelined send and receive functions. * * If a receiver finds no waiting message, then it registers itself in the * list of waiting receivers. A sender checks that list before adding the new * message into the message array. If there is a waiting receiver, then it * bypasses the message array and directly hands the message over to the * receiver. The receiver accepts the message and returns without grabbing the * queue spinlock: * * - Set pointer to message. * - Queue the receiver task for later wakeup (without the info->lock). * - Update its state to STATE_READY. Now the receiver can continue. * - Wake up the process after the lock is dropped. Should the process wake up * before this wakeup (due to a timeout or a signal) it will either see * STATE_READY and continue or acquire the lock to check the state again. * * The same algorithm is used for senders. */ static inline void __pipelined_op(struct wake_q_head *wake_q, struct mqueue_inode_info *info, struct ext_wait_queue *this) { struct task_struct *task; list_del(&this->list); task = get_task_struct(this->task); /* see MQ_BARRIER for purpose/pairing */ smp_store_release(&this->state, STATE_READY); wake_q_add_safe(wake_q, task); } /* pipelined_send() - send a message directly to the task waiting in * sys_mq_timedreceive() (without inserting message into a queue). */ static inline void pipelined_send(struct wake_q_head *wake_q, struct mqueue_inode_info *info, struct msg_msg *message, struct ext_wait_queue *receiver) { receiver->msg = message; __pipelined_op(wake_q, info, receiver); } /* pipelined_receive() - if there is task waiting in sys_mq_timedsend() * gets its message and put to the queue (we have one free place for sure). */ static inline void pipelined_receive(struct wake_q_head *wake_q, struct mqueue_inode_info *info) { struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND); if (!sender) { /* for poll */ wake_up_interruptible(&info->wait_q); return; } if (msg_insert(sender->msg, info)) return; __pipelined_op(wake_q, info, sender); } static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr, size_t msg_len, unsigned int msg_prio, struct timespec64 *ts) { struct fd f; struct inode *inode; struct ext_wait_queue wait; struct ext_wait_queue *receiver; struct msg_msg *msg_ptr; struct mqueue_inode_info *info; ktime_t expires, *timeout = NULL; struct posix_msg_tree_node *new_leaf = NULL; int ret = 0; DEFINE_WAKE_Q(wake_q); if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX)) return -EINVAL; if (ts) { expires = timespec64_to_ktime(*ts); timeout = &expires; } audit_mq_sendrecv(mqdes, msg_len, msg_prio, ts); f = fdget(mqdes); if (unlikely(!fd_file(f))) { ret = -EBADF; goto out; } inode = file_inode(fd_file(f)); if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) { ret = -EBADF; goto out_fput; } info = MQUEUE_I(inode); audit_file(fd_file(f)); if (unlikely(!(fd_file(f)->f_mode & FMODE_WRITE))) { ret = -EBADF; goto out_fput; } if (unlikely(msg_len > info->attr.mq_msgsize)) { ret = -EMSGSIZE; goto out_fput; } /* First try to allocate memory, before doing anything with * existing queues. */ msg_ptr = load_msg(u_msg_ptr, msg_len); if (IS_ERR(msg_ptr)) { ret = PTR_ERR(msg_ptr); goto out_fput; } msg_ptr->m_ts = msg_len; msg_ptr->m_type = msg_prio; /* * msg_insert really wants us to have a valid, spare node struct so * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will * fall back to that if necessary. */ if (!info->node_cache) new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL); spin_lock(&info->lock); if (!info->node_cache && new_leaf) { /* Save our speculative allocation into the cache */ INIT_LIST_HEAD(&new_leaf->msg_list); info->node_cache = new_leaf; new_leaf = NULL; } else { kfree(new_leaf); } if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) { if (fd_file(f)->f_flags & O_NONBLOCK) { ret = -EAGAIN; } else { wait.task = current; wait.msg = (void *) msg_ptr; /* memory barrier not required, we hold info->lock */ WRITE_ONCE(wait.state, STATE_NONE); ret = wq_sleep(info, SEND, timeout, &wait); /* * wq_sleep must be called with info->lock held, and * returns with the lock released */ goto out_free; } } else { receiver = wq_get_first_waiter(info, RECV); if (receiver) { pipelined_send(&wake_q, info, msg_ptr, receiver); } else { /* adds message to the queue */ ret = msg_insert(msg_ptr, info); if (ret) goto out_unlock; __do_notify(info); } simple_inode_init_ts(inode); } out_unlock: spin_unlock(&info->lock); wake_up_q(&wake_q); out_free: if (ret) free_msg(msg_ptr); out_fput: fdput(f); out: return ret; } static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, size_t msg_len, unsigned int __user *u_msg_prio, struct timespec64 *ts) { ssize_t ret; struct msg_msg *msg_ptr; struct fd f; struct inode *inode; struct mqueue_inode_info *info; struct ext_wait_queue wait; ktime_t expires, *timeout = NULL; struct posix_msg_tree_node *new_leaf = NULL; if (ts) { expires = timespec64_to_ktime(*ts); timeout = &expires; } audit_mq_sendrecv(mqdes, msg_len, 0, ts); f = fdget(mqdes); if (unlikely(!fd_file(f))) { ret = -EBADF; goto out; } inode = file_inode(fd_file(f)); if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) { ret = -EBADF; goto out_fput; } info = MQUEUE_I(inode); audit_file(fd_file(f)); if (unlikely(!(fd_file(f)->f_mode & FMODE_READ))) { ret = -EBADF; goto out_fput; } /* checks if buffer is big enough */ if (unlikely(msg_len < info->attr.mq_msgsize)) { ret = -EMSGSIZE; goto out_fput; } /* * msg_insert really wants us to have a valid, spare node struct so * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will * fall back to that if necessary. */ if (!info->node_cache) new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL); spin_lock(&info->lock); if (!info->node_cache && new_leaf) { /* Save our speculative allocation into the cache */ INIT_LIST_HEAD(&new_leaf->msg_list); info->node_cache = new_leaf; } else { kfree(new_leaf); } if (info->attr.mq_curmsgs == 0) { if (fd_file(f)->f_flags & O_NONBLOCK) { spin_unlock(&info->lock); ret = -EAGAIN; } else { wait.task = current; /* memory barrier not required, we hold info->lock */ WRITE_ONCE(wait.state, STATE_NONE); ret = wq_sleep(info, RECV, timeout, &wait); msg_ptr = wait.msg; } } else { DEFINE_WAKE_Q(wake_q); msg_ptr = msg_get(info); simple_inode_init_ts(inode); /* There is now free space in queue. */ pipelined_receive(&wake_q, info); spin_unlock(&info->lock); wake_up_q(&wake_q); ret = 0; } if (ret == 0) { ret = msg_ptr->m_ts; if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) || store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) { ret = -EFAULT; } free_msg(msg_ptr); } out_fput: fdput(f); out: return ret; } SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr, size_t, msg_len, unsigned int, msg_prio, const struct __kernel_timespec __user *, u_abs_timeout) { struct timespec64 ts, *p = NULL; if (u_abs_timeout) { int res = prepare_timeout(u_abs_timeout, &ts); if (res) return res; p = &ts; } return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, p); } SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr, size_t, msg_len, unsigned int __user *, u_msg_prio, const struct __kernel_timespec __user *, u_abs_timeout) { struct timespec64 ts, *p = NULL; if (u_abs_timeout) { int res = prepare_timeout(u_abs_timeout, &ts); if (res) return res; p = &ts; } return do_mq_timedreceive(mqdes, u_msg_ptr, msg_len, u_msg_prio, p); } /* * Notes: the case when user wants us to deregister (with NULL as pointer) * and he isn't currently owner of notification, will be silently discarded. * It isn't explicitly defined in the POSIX. */ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification) { int ret; struct fd f; struct sock *sock; struct inode *inode; struct mqueue_inode_info *info; struct sk_buff *nc; audit_mq_notify(mqdes, notification); nc = NULL; sock = NULL; if (notification != NULL) { if (unlikely(notification->sigev_notify != SIGEV_NONE && notification->sigev_notify != SIGEV_SIGNAL && notification->sigev_notify != SIGEV_THREAD)) return -EINVAL; if (notification->sigev_notify == SIGEV_SIGNAL && !valid_signal(notification->sigev_signo)) { return -EINVAL; } if (notification->sigev_notify == SIGEV_THREAD) { long timeo; /* create the notify skb */ nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL); if (!nc) return -ENOMEM; if (copy_from_user(nc->data, notification->sigev_value.sival_ptr, NOTIFY_COOKIE_LEN)) { ret = -EFAULT; goto free_skb; } /* TODO: add a header? */ skb_put(nc, NOTIFY_COOKIE_LEN); /* and attach it to the socket */ retry: f = fdget(notification->sigev_signo); if (!fd_file(f)) { ret = -EBADF; goto out; } sock = netlink_getsockbyfilp(fd_file(f)); fdput(f); if (IS_ERR(sock)) { ret = PTR_ERR(sock); goto free_skb; } timeo = MAX_SCHEDULE_TIMEOUT; ret = netlink_attachskb(sock, nc, &timeo, NULL); if (ret == 1) { sock = NULL; goto retry; } if (ret) return ret; } } f = fdget(mqdes); if (!fd_file(f)) { ret = -EBADF; goto out; } inode = file_inode(fd_file(f)); if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) { ret = -EBADF; goto out_fput; } info = MQUEUE_I(inode); ret = 0; spin_lock(&info->lock); if (notification == NULL) { if (info->notify_owner == task_tgid(current)) { remove_notification(info); inode_set_atime_to_ts(inode, inode_set_ctime_current(inode)); } } else if (info->notify_owner != NULL) { ret = -EBUSY; } else { switch (notification->sigev_notify) { case SIGEV_NONE: info->notify.sigev_notify = SIGEV_NONE; break; case SIGEV_THREAD: info->notify_sock = sock; info->notify_cookie = nc; sock = NULL; nc = NULL; info->notify.sigev_notify = SIGEV_THREAD; break; case SIGEV_SIGNAL: info->notify.sigev_signo = notification->sigev_signo; info->notify.sigev_value = notification->sigev_value; info->notify.sigev_notify = SIGEV_SIGNAL; info->notify_self_exec_id = current->self_exec_id; break; } info->notify_owner = get_pid(task_tgid(current)); info->notify_user_ns = get_user_ns(current_user_ns()); inode_set_atime_to_ts(inode, inode_set_ctime_current(inode)); } spin_unlock(&info->lock); out_fput: fdput(f); out: if (sock) netlink_detachskb(sock, nc); else free_skb: dev_kfree_skb(nc); return ret; } SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes, const struct sigevent __user *, u_notification) { struct sigevent n, *p = NULL; if (u_notification) { if (copy_from_user(&n, u_notification, sizeof(struct sigevent))) return -EFAULT; p = &n; } return do_mq_notify(mqdes, p); } static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old) { struct fd f; struct inode *inode; struct mqueue_inode_info *info; if (new && (new->mq_flags & (~O_NONBLOCK))) return -EINVAL; f = fdget(mqdes); if (!fd_file(f)) return -EBADF; if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) { fdput(f); return -EBADF; } inode = file_inode(fd_file(f)); info = MQUEUE_I(inode); spin_lock(&info->lock); if (old) { *old = info->attr; old->mq_flags = fd_file(f)->f_flags & O_NONBLOCK; } if (new) { audit_mq_getsetattr(mqdes, new); spin_lock(&fd_file(f)->f_lock); if (new->mq_flags & O_NONBLOCK) fd_file(f)->f_flags |= O_NONBLOCK; else fd_file(f)->f_flags &= ~O_NONBLOCK; spin_unlock(&fd_file(f)->f_lock); inode_set_atime_to_ts(inode, inode_set_ctime_current(inode)); } spin_unlock(&info->lock); fdput(f); return 0; } SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes, const struct mq_attr __user *, u_mqstat, struct mq_attr __user *, u_omqstat) { int ret; struct mq_attr mqstat, omqstat; struct mq_attr *new = NULL, *old = NULL; if (u_mqstat) { new = &mqstat; if (copy_from_user(new, u_mqstat, sizeof(struct mq_attr))) return -EFAULT; } if (u_omqstat) old = &omqstat; ret = do_mq_getsetattr(mqdes, new, old); if (ret || !old) return ret; if (copy_to_user(u_omqstat, old, sizeof(struct mq_attr))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT struct compat_mq_attr { compat_long_t mq_flags; /* message queue flags */ compat_long_t mq_maxmsg; /* maximum number of messages */ compat_long_t mq_msgsize; /* maximum message size */ compat_long_t mq_curmsgs; /* number of messages currently queued */ compat_long_t __reserved[4]; /* ignored for input, zeroed for output */ }; static inline int get_compat_mq_attr(struct mq_attr *attr, const struct compat_mq_attr __user *uattr) { struct compat_mq_attr v; if (copy_from_user(&v, uattr, sizeof(*uattr))) return -EFAULT; memset(attr, 0, sizeof(*attr)); attr->mq_flags = v.mq_flags; attr->mq_maxmsg = v.mq_maxmsg; attr->mq_msgsize = v.mq_msgsize; attr->mq_curmsgs = v.mq_curmsgs; return 0; } static inline int put_compat_mq_attr(const struct mq_attr *attr, struct compat_mq_attr __user *uattr) { struct compat_mq_attr v; memset(&v, 0, sizeof(v)); v.mq_flags = attr->mq_flags; v.mq_maxmsg = attr->mq_maxmsg; v.mq_msgsize = attr->mq_msgsize; v.mq_curmsgs = attr->mq_curmsgs; if (copy_to_user(uattr, &v, sizeof(*uattr))) return -EFAULT; return 0; } COMPAT_SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, compat_mode_t, mode, struct compat_mq_attr __user *, u_attr) { struct mq_attr attr, *p = NULL; if (u_attr && oflag & O_CREAT) { p = &attr; if (get_compat_mq_attr(&attr, u_attr)) return -EFAULT; } return do_mq_open(u_name, oflag, mode, p); } COMPAT_SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes, const struct compat_sigevent __user *, u_notification) { struct sigevent n, *p = NULL; if (u_notification) { if (get_compat_sigevent(&n, u_notification)) return -EFAULT; if (n.sigev_notify == SIGEV_THREAD) n.sigev_value.sival_ptr = compat_ptr(n.sigev_value.sival_int); p = &n; } return do_mq_notify(mqdes, p); } COMPAT_SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes, const struct compat_mq_attr __user *, u_mqstat, struct compat_mq_attr __user *, u_omqstat) { int ret; struct mq_attr mqstat, omqstat; struct mq_attr *new = NULL, *old = NULL; if (u_mqstat) { new = &mqstat; if (get_compat_mq_attr(new, u_mqstat)) return -EFAULT; } if (u_omqstat) old = &omqstat; ret = do_mq_getsetattr(mqdes, new, old); if (ret || !old) return ret; if (put_compat_mq_attr(old, u_omqstat)) return -EFAULT; return 0; } #endif #ifdef CONFIG_COMPAT_32BIT_TIME static int compat_prepare_timeout(const struct old_timespec32 __user *p, struct timespec64 *ts) { if (get_old_timespec32(ts, p)) return -EFAULT; if (!timespec64_valid(ts)) return -EINVAL; return 0; } SYSCALL_DEFINE5(mq_timedsend_time32, mqd_t, mqdes, const char __user *, u_msg_ptr, unsigned int, msg_len, unsigned int, msg_prio, const struct old_timespec32 __user *, u_abs_timeout) { struct timespec64 ts, *p = NULL; if (u_abs_timeout) { int res = compat_prepare_timeout(u_abs_timeout, &ts); if (res) return res; p = &ts; } return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, p); } SYSCALL_DEFINE5(mq_timedreceive_time32, mqd_t, mqdes, char __user *, u_msg_ptr, unsigned int, msg_len, unsigned int __user *, u_msg_prio, const struct old_timespec32 __user *, u_abs_timeout) { struct timespec64 ts, *p = NULL; if (u_abs_timeout) { int res = compat_prepare_timeout(u_abs_timeout, &ts); if (res) return res; p = &ts; } return do_mq_timedreceive(mqdes, u_msg_ptr, msg_len, u_msg_prio, p); } #endif static const struct inode_operations mqueue_dir_inode_operations = { .lookup = simple_lookup, .create = mqueue_create, .unlink = mqueue_unlink, }; static const struct file_operations mqueue_file_operations = { .flush = mqueue_flush_file, .poll = mqueue_poll_file, .read = mqueue_read_file, .llseek = default_llseek, }; static const struct super_operations mqueue_super_ops = { .alloc_inode = mqueue_alloc_inode, .free_inode = mqueue_free_inode, .evict_inode = mqueue_evict_inode, .statfs = simple_statfs, }; static const struct fs_context_operations mqueue_fs_context_ops = { .free = mqueue_fs_context_free, .get_tree = mqueue_get_tree, }; static struct file_system_type mqueue_fs_type = { .name = "mqueue", .init_fs_context = mqueue_init_fs_context, .kill_sb = kill_litter_super, .fs_flags = FS_USERNS_MOUNT, }; int mq_init_ns(struct ipc_namespace *ns) { struct vfsmount *m; ns->mq_queues_count = 0; ns->mq_queues_max = DFLT_QUEUESMAX; ns->mq_msg_max = DFLT_MSGMAX; ns->mq_msgsize_max = DFLT_MSGSIZEMAX; ns->mq_msg_default = DFLT_MSG; ns->mq_msgsize_default = DFLT_MSGSIZE; m = mq_create_mount(ns); if (IS_ERR(m)) return PTR_ERR(m); ns->mq_mnt = m; return 0; } void mq_clear_sbinfo(struct ipc_namespace *ns) { ns->mq_mnt->mnt_sb->s_fs_info = NULL; } static int __init init_mqueue_fs(void) { int error; mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache", sizeof(struct mqueue_inode_info), 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, init_once); if (mqueue_inode_cachep == NULL) return -ENOMEM; if (!setup_mq_sysctls(&init_ipc_ns)) { pr_warn("sysctl registration failed\n"); error = -ENOMEM; goto out_kmem; } error = register_filesystem(&mqueue_fs_type); if (error) goto out_sysctl; spin_lock_init(&mq_lock); error = mq_init_ns(&init_ipc_ns); if (error) goto out_filesystem; return 0; out_filesystem: unregister_filesystem(&mqueue_fs_type); out_sysctl: retire_mq_sysctls(&init_ipc_ns); out_kmem: kmem_cache_destroy(mqueue_inode_cachep); return error; } device_initcall(init_mqueue_fs); |
9 5 4 6 2 2 4 1 1 2 1 6 2 2 2 2 2 2 3 1 2 1 1 1 1 16 7 7 9 1 9 2 5 2 4 1 1 2 3 169 168 44 140 3 2 1 1 1 5 4 1 1 2 1 2 2 1 1 4 4 1 2 1 15 11 11 2 1 1 1 11 2 3 3 13 1 13 5 9 3 1 2 2 2 1 2 3 3 5 5 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 4 1 1 1 1 2 1 1 7 1 2 1 1 1 1 2 1 1 10 1 1 5 2 2 2 1 1 4 3 3 2 1 1 3 1 1 1 1 1 9 1 9 1 1 7 7 4 1 1 6 1 5 2 2 2 2 2 1 1 1 2 2 1 1 1 1 4 5 5 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 4 1 1 1 1 1 1 1 1 1 1 2 1 1 3 1 1 1 10 82 83 114 50 163 24 2 4 14 2 1 1 2 1 1 1 1 2 1 1 2 4 2 6 2 3 1 9 1 6 2 1 4 1 1 5 1 3 1 1 1 1 2 4 2 10 1 1 1 1 1 1 1 1 2 4 3 5 1 1 2 3 165 156 8 114 51 166 1 158 1 5 166 91 53 1 4 13 1 161 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/ethtool.c - Ethtool ioctl handler * Copyright (c) 2003 Matthew Wilcox <matthew@wil.cx> * * This file is where we call all the ethtool_ops commands to get * the information ethtool needs. */ #include <linux/compat.h> #include <linux/etherdevice.h> #include <linux/module.h> #include <linux/types.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/ethtool.h> #include <linux/netdevice.h> #include <linux/net_tstamp.h> #include <linux/phy.h> #include <linux/bitops.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/sfp.h> #include <linux/slab.h> #include <linux/rtnetlink.h> #include <linux/sched/signal.h> #include <linux/net.h> #include <linux/pm_runtime.h> #include <linux/utsname.h> #include <net/devlink.h> #include <net/ipv6.h> #include <net/xdp_sock_drv.h> #include <net/flow_offload.h> #include <linux/ethtool_netlink.h> #include "common.h" /* State held across locks and calls for commands which have devlink fallback */ struct ethtool_devlink_compat { struct devlink *devlink; union { struct ethtool_flash efl; struct ethtool_drvinfo info; }; }; static struct devlink *netdev_to_devlink_get(struct net_device *dev) { if (!dev->devlink_port) return NULL; return devlink_try_get(dev->devlink_port->devlink); } /* * Some useful ethtool_ops methods that're device independent. * If we find that all drivers want to do the same thing here, * we can turn these into dev_() function calls. */ u32 ethtool_op_get_link(struct net_device *dev) { /* Synchronize carrier state with link watch, see also rtnl_getlink() */ linkwatch_sync_dev(dev); return netif_carrier_ok(dev) ? 1 : 0; } EXPORT_SYMBOL(ethtool_op_get_link); int ethtool_op_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info) { info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE | SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; info->phc_index = -1; return 0; } EXPORT_SYMBOL(ethtool_op_get_ts_info); /* Handlers for each ethtool command */ static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { .cmd = ETHTOOL_GFEATURES, .size = ETHTOOL_DEV_FEATURE_WORDS, }; struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; u32 __user *sizeaddr; u32 copy_size; int i; /* in case feature bits run out again */ BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t)); for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { features[i].available = (u32)(dev->hw_features >> (32 * i)); features[i].requested = (u32)(dev->wanted_features >> (32 * i)); features[i].active = (u32)(dev->features >> (32 * i)); features[i].never_changed = (u32)(NETIF_F_NEVER_CHANGE >> (32 * i)); } sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size); if (get_user(copy_size, sizeaddr)) return -EFAULT; if (copy_size > ETHTOOL_DEV_FEATURE_WORDS) copy_size = ETHTOOL_DEV_FEATURE_WORDS; if (copy_to_user(useraddr, &cmd, sizeof(cmd))) return -EFAULT; useraddr += sizeof(cmd); if (copy_to_user(useraddr, features, array_size(copy_size, sizeof(*features)))) return -EFAULT; return 0; } static int ethtool_set_features(struct net_device *dev, void __user *useraddr) { struct ethtool_sfeatures cmd; struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS]; netdev_features_t wanted = 0, valid = 0; int i, ret = 0; if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; useraddr += sizeof(cmd); if (cmd.size != ETHTOOL_DEV_FEATURE_WORDS) return -EINVAL; if (copy_from_user(features, useraddr, sizeof(features))) return -EFAULT; for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) { valid |= (netdev_features_t)features[i].valid << (32 * i); wanted |= (netdev_features_t)features[i].requested << (32 * i); } if (valid & ~NETIF_F_ETHTOOL_BITS) return -EINVAL; if (valid & ~dev->hw_features) { valid &= dev->hw_features; ret |= ETHTOOL_F_UNSUPPORTED; } dev->wanted_features &= ~valid; dev->wanted_features |= wanted & valid; __netdev_update_features(dev); if ((dev->wanted_features ^ dev->features) & valid) ret |= ETHTOOL_F_WISH; return ret; } static int __ethtool_get_sset_count(struct net_device *dev, int sset) { const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; const struct ethtool_ops *ops = dev->ethtool_ops; if (sset == ETH_SS_FEATURES) return ARRAY_SIZE(netdev_features_strings); if (sset == ETH_SS_RSS_HASH_FUNCS) return ARRAY_SIZE(rss_hash_func_strings); if (sset == ETH_SS_TUNABLES) return ARRAY_SIZE(tunable_strings); if (sset == ETH_SS_PHY_TUNABLES) return ARRAY_SIZE(phy_tunable_strings); if (sset == ETH_SS_PHY_STATS && dev->phydev && !ops->get_ethtool_phy_stats && phy_ops && phy_ops->get_sset_count) return phy_ops->get_sset_count(dev->phydev); if (sset == ETH_SS_LINK_MODES) return __ETHTOOL_LINK_MODE_MASK_NBITS; if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else return -EOPNOTSUPP; } static void __ethtool_get_strings(struct net_device *dev, u32 stringset, u8 *data) { const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; const struct ethtool_ops *ops = dev->ethtool_ops; if (stringset == ETH_SS_FEATURES) memcpy(data, netdev_features_strings, sizeof(netdev_features_strings)); else if (stringset == ETH_SS_RSS_HASH_FUNCS) memcpy(data, rss_hash_func_strings, sizeof(rss_hash_func_strings)); else if (stringset == ETH_SS_TUNABLES) memcpy(data, tunable_strings, sizeof(tunable_strings)); else if (stringset == ETH_SS_PHY_TUNABLES) memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings)); else if (stringset == ETH_SS_PHY_STATS && dev->phydev && !ops->get_ethtool_phy_stats && phy_ops && phy_ops->get_strings) phy_ops->get_strings(dev->phydev, data); else if (stringset == ETH_SS_LINK_MODES) memcpy(data, link_mode_names, __ETHTOOL_LINK_MODE_MASK_NBITS * ETH_GSTRING_LEN); else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); } static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd) { /* feature masks of legacy discrete ethtool ops */ switch (eth_cmd) { case ETHTOOL_GTXCSUM: case ETHTOOL_STXCSUM: return NETIF_F_CSUM_MASK | NETIF_F_FCOE_CRC | NETIF_F_SCTP_CRC; case ETHTOOL_GRXCSUM: case ETHTOOL_SRXCSUM: return NETIF_F_RXCSUM; case ETHTOOL_GSG: case ETHTOOL_SSG: return NETIF_F_SG | NETIF_F_FRAGLIST; case ETHTOOL_GTSO: case ETHTOOL_STSO: return NETIF_F_ALL_TSO; case ETHTOOL_GGSO: case ETHTOOL_SGSO: return NETIF_F_GSO; case ETHTOOL_GGRO: case ETHTOOL_SGRO: return NETIF_F_GRO; default: BUG(); } } static int ethtool_get_one_feature(struct net_device *dev, char __user *useraddr, u32 ethcmd) { netdev_features_t mask = ethtool_get_feature_mask(ethcmd); struct ethtool_value edata = { .cmd = ethcmd, .data = !!(dev->features & mask), }; if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; return 0; } static int ethtool_set_one_feature(struct net_device *dev, void __user *useraddr, u32 ethcmd) { struct ethtool_value edata; netdev_features_t mask; if (copy_from_user(&edata, useraddr, sizeof(edata))) return -EFAULT; mask = ethtool_get_feature_mask(ethcmd); mask &= dev->hw_features; if (!mask) return -EOPNOTSUPP; if (edata.data) dev->wanted_features |= mask; else dev->wanted_features &= ~mask; __netdev_update_features(dev); return 0; } #define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \ ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH) #define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_CTAG_RX | \ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_NTUPLE | \ NETIF_F_RXHASH) static u32 __ethtool_get_flags(struct net_device *dev) { u32 flags = 0; if (dev->features & NETIF_F_LRO) flags |= ETH_FLAG_LRO; if (dev->features & NETIF_F_HW_VLAN_CTAG_RX) flags |= ETH_FLAG_RXVLAN; if (dev->features & NETIF_F_HW_VLAN_CTAG_TX) flags |= ETH_FLAG_TXVLAN; if (dev->features & NETIF_F_NTUPLE) flags |= ETH_FLAG_NTUPLE; if (dev->features & NETIF_F_RXHASH) flags |= ETH_FLAG_RXHASH; return flags; } static int __ethtool_set_flags(struct net_device *dev, u32 data) { netdev_features_t features = 0, changed; if (data & ~ETH_ALL_FLAGS) return -EINVAL; if (data & ETH_FLAG_LRO) features |= NETIF_F_LRO; if (data & ETH_FLAG_RXVLAN) features |= NETIF_F_HW_VLAN_CTAG_RX; if (data & ETH_FLAG_TXVLAN) features |= NETIF_F_HW_VLAN_CTAG_TX; if (data & ETH_FLAG_NTUPLE) features |= NETIF_F_NTUPLE; if (data & ETH_FLAG_RXHASH) features |= NETIF_F_RXHASH; /* allow changing only bits set in hw_features */ changed = (features ^ dev->features) & ETH_ALL_FEATURES; if (changed & ~dev->hw_features) return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP; dev->wanted_features = (dev->wanted_features & ~changed) | (features & changed); __netdev_update_features(dev); return 0; } /* Given two link masks, AND them together and save the result in dst. */ void ethtool_intersect_link_masks(struct ethtool_link_ksettings *dst, struct ethtool_link_ksettings *src) { unsigned int size = BITS_TO_LONGS(__ETHTOOL_LINK_MODE_MASK_NBITS); unsigned int idx = 0; for (; idx < size; idx++) { dst->link_modes.supported[idx] &= src->link_modes.supported[idx]; dst->link_modes.advertising[idx] &= src->link_modes.advertising[idx]; } } EXPORT_SYMBOL(ethtool_intersect_link_masks); void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32) { linkmode_zero(dst); dst[0] = legacy_u32; } EXPORT_SYMBOL(ethtool_convert_legacy_u32_to_link_mode); /* return false if src had higher bits set. lower bits always updated. */ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, const unsigned long *src) { *legacy_u32 = src[0]; return find_next_bit(src, __ETHTOOL_LINK_MODE_MASK_NBITS, 32) == __ETHTOOL_LINK_MODE_MASK_NBITS; } EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32); /* return false if ksettings link modes had higher bits * set. legacy_settings always updated (best effort) */ static bool convert_link_ksettings_to_legacy_settings( struct ethtool_cmd *legacy_settings, const struct ethtool_link_ksettings *link_ksettings) { bool retval = true; memset(legacy_settings, 0, sizeof(*legacy_settings)); /* this also clears the deprecated fields in legacy structure: * __u8 transceiver; * __u32 maxtxpkt; * __u32 maxrxpkt; */ retval &= ethtool_convert_link_mode_to_legacy_u32( &legacy_settings->supported, link_ksettings->link_modes.supported); retval &= ethtool_convert_link_mode_to_legacy_u32( &legacy_settings->advertising, link_ksettings->link_modes.advertising); retval &= ethtool_convert_link_mode_to_legacy_u32( &legacy_settings->lp_advertising, link_ksettings->link_modes.lp_advertising); ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed); legacy_settings->duplex = link_ksettings->base.duplex; legacy_settings->port = link_ksettings->base.port; legacy_settings->phy_address = link_ksettings->base.phy_address; legacy_settings->autoneg = link_ksettings->base.autoneg; legacy_settings->mdio_support = link_ksettings->base.mdio_support; legacy_settings->eth_tp_mdix = link_ksettings->base.eth_tp_mdix; legacy_settings->eth_tp_mdix_ctrl = link_ksettings->base.eth_tp_mdix_ctrl; legacy_settings->transceiver = link_ksettings->base.transceiver; return retval; } /* number of 32-bit words to store the user's link mode bitmaps */ #define __ETHTOOL_LINK_MODE_MASK_NU32 \ DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32) /* layout of the struct passed from/to userland */ struct ethtool_link_usettings { struct ethtool_link_settings base; struct { __u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32]; __u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; __u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32]; } link_modes; }; /* Internal kernel helper to query a device ethtool_link_settings. */ int __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings) { ASSERT_RTNL(); if (!dev->ethtool_ops->get_link_ksettings) return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; memset(link_ksettings, 0, sizeof(*link_ksettings)); return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings); } EXPORT_SYMBOL(__ethtool_get_link_ksettings); /* convert ethtool_link_usettings in user space to a kernel internal * ethtool_link_ksettings. return 0 on success, errno on error. */ static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to, const void __user *from) { struct ethtool_link_usettings link_usettings; if (copy_from_user(&link_usettings, from, sizeof(link_usettings))) return -EFAULT; memcpy(&to->base, &link_usettings.base, sizeof(to->base)); bitmap_from_arr32(to->link_modes.supported, link_usettings.link_modes.supported, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_from_arr32(to->link_modes.advertising, link_usettings.link_modes.advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_from_arr32(to->link_modes.lp_advertising, link_usettings.link_modes.lp_advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); return 0; } /* Check if the user is trying to change anything besides speed/duplex */ bool ethtool_virtdev_validate_cmd(const struct ethtool_link_ksettings *cmd) { struct ethtool_link_settings base2 = {}; base2.speed = cmd->base.speed; base2.port = PORT_OTHER; base2.duplex = cmd->base.duplex; base2.cmd = cmd->base.cmd; base2.link_mode_masks_nwords = cmd->base.link_mode_masks_nwords; return !memcmp(&base2, &cmd->base, sizeof(base2)) && bitmap_empty(cmd->link_modes.supported, __ETHTOOL_LINK_MODE_MASK_NBITS) && bitmap_empty(cmd->link_modes.lp_advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); } /* convert a kernel internal ethtool_link_ksettings to * ethtool_link_usettings in user space. return 0 on success, errno on * error. */ static int store_link_ksettings_for_user(void __user *to, const struct ethtool_link_ksettings *from) { struct ethtool_link_usettings link_usettings; memcpy(&link_usettings, from, sizeof(link_usettings)); bitmap_to_arr32(link_usettings.link_modes.supported, from->link_modes.supported, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_to_arr32(link_usettings.link_modes.advertising, from->link_modes.advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); bitmap_to_arr32(link_usettings.link_modes.lp_advertising, from->link_modes.lp_advertising, __ETHTOOL_LINK_MODE_MASK_NBITS); if (copy_to_user(to, &link_usettings, sizeof(link_usettings))) return -EFAULT; return 0; } /* Query device for its ethtool_link_settings. */ static int ethtool_get_link_ksettings(struct net_device *dev, void __user *useraddr) { int err = 0; struct ethtool_link_ksettings link_ksettings; ASSERT_RTNL(); if (!dev->ethtool_ops->get_link_ksettings) return -EOPNOTSUPP; /* handle bitmap nbits handshake */ if (copy_from_user(&link_ksettings.base, useraddr, sizeof(link_ksettings.base))) return -EFAULT; if (__ETHTOOL_LINK_MODE_MASK_NU32 != link_ksettings.base.link_mode_masks_nwords) { /* wrong link mode nbits requested */ memset(&link_ksettings, 0, sizeof(link_ksettings)); link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; /* send back number of words required as negative val */ compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX, "need too many bits for link modes!"); link_ksettings.base.link_mode_masks_nwords = -((s8)__ETHTOOL_LINK_MODE_MASK_NU32); /* copy the base fields back to user, not the link * mode bitmaps */ if (copy_to_user(useraddr, &link_ksettings.base, sizeof(link_ksettings.base))) return -EFAULT; return 0; } /* handshake successful: user/kernel agree on * link_mode_masks_nwords */ memset(&link_ksettings, 0, sizeof(link_ksettings)); err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings); if (err < 0) return err; /* make sure we tell the right values to user */ link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; link_ksettings.base.link_mode_masks_nwords = __ETHTOOL_LINK_MODE_MASK_NU32; link_ksettings.base.master_slave_cfg = MASTER_SLAVE_CFG_UNSUPPORTED; link_ksettings.base.master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED; link_ksettings.base.rate_matching = RATE_MATCH_NONE; return store_link_ksettings_for_user(useraddr, &link_ksettings); } /* Update device ethtool_link_settings. */ static int ethtool_set_link_ksettings(struct net_device *dev, void __user *useraddr) { struct ethtool_link_ksettings link_ksettings = {}; int err; ASSERT_RTNL(); if (!dev->ethtool_ops->set_link_ksettings) return -EOPNOTSUPP; /* make sure nbits field has expected value */ if (copy_from_user(&link_ksettings.base, useraddr, sizeof(link_ksettings.base))) return -EFAULT; if (__ETHTOOL_LINK_MODE_MASK_NU32 != link_ksettings.base.link_mode_masks_nwords) return -EINVAL; /* copy the whole structure, now that we know it has expected * format */ err = load_link_ksettings_from_user(&link_ksettings, useraddr); if (err) return err; /* re-check nwords field, just in case */ if (__ETHTOOL_LINK_MODE_MASK_NU32 != link_ksettings.base.link_mode_masks_nwords) return -EINVAL; if (link_ksettings.base.master_slave_cfg || link_ksettings.base.master_slave_state) return -EINVAL; err = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); if (err >= 0) { ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); } return err; } int ethtool_virtdev_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd, u32 *dev_speed, u8 *dev_duplex) { u32 speed; u8 duplex; speed = cmd->base.speed; duplex = cmd->base.duplex; /* don't allow custom speed and duplex */ if (!ethtool_validate_speed(speed) || !ethtool_validate_duplex(duplex) || !ethtool_virtdev_validate_cmd(cmd)) return -EINVAL; *dev_speed = speed; *dev_duplex = duplex; return 0; } EXPORT_SYMBOL(ethtool_virtdev_set_link_ksettings); /* Query device for its ethtool_cmd settings. * * Backward compatibility note: for compatibility with legacy ethtool, this is * now implemented via get_link_ksettings. When driver reports higher link mode * bits, a kernel warning is logged once (with name of 1st driver/device) to * recommend user to upgrade ethtool, but the command is successful (only the * lower link mode bits reported back to user). Deprecated fields from * ethtool_cmd (transceiver/maxrxpkt/maxtxpkt) are always set to zero. */ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) { struct ethtool_link_ksettings link_ksettings; struct ethtool_cmd cmd; int err; ASSERT_RTNL(); if (!dev->ethtool_ops->get_link_ksettings) return -EOPNOTSUPP; if (dev->ethtool->module_fw_flash_in_progress) return -EBUSY; memset(&link_ksettings, 0, sizeof(link_ksettings)); err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings); if (err < 0) return err; convert_link_ksettings_to_legacy_settings(&cmd, &link_ksettings); /* send a sensible cmd tag back to user */ cmd.cmd = ETHTOOL_GSET; if (copy_to_user(useraddr, &cmd, sizeof(cmd))) return -EFAULT; return 0; } /* Update device link settings with given ethtool_cmd. * * Backward compatibility note: for compatibility with legacy ethtool, this is * now always implemented via set_link_settings. When user's request updates * deprecated ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel * warning is logged once (with name of 1st driver/device) to recommend user to * upgrade ethtool, and the request is rejected. */ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) { struct ethtool_link_ksettings link_ksettings; struct ethtool_cmd cmd; int ret; ASSERT_RTNL(); if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; if (!dev->ethtool_ops->set_link_ksettings) return -EOPNOTSUPP; if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, &cmd)) return -EINVAL; link_ksettings.base.link_mode_masks_nwords = __ETHTOOL_LINK_MODE_MASK_NU32; ret = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); if (ret >= 0) { ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF, NULL); } return ret; } static int ethtool_get_drvinfo(struct net_device *dev, struct ethtool_devlink_compat *rsp) { const struct ethtool_ops *ops = dev->ethtool_ops; struct device *parent = dev->dev.parent; rsp->info.cmd = ETHTOOL_GDRVINFO; strscpy(rsp->info.version, init_uts_ns.name.release, sizeof(rsp->info.version)); if (ops->get_drvinfo) { ops->get_drvinfo(dev, &rsp->info); if (!rsp->info.bus_info[0] && parent) strscpy(rsp->info.bus_info, dev_name(parent), sizeof(rsp->info.bus_info)); if (!rsp->info.driver[0] && parent && parent->driver) strscpy(rsp->info.driver, parent->driver->name, sizeof(rsp->info.driver)); } else if (parent && parent->driver) { strscpy(rsp->info.bus_info, dev_name(parent), sizeof(rsp->info.bus_info)); strscpy(rsp->info.driver, parent->driver->name, sizeof(rsp->info.driver)); } else if (dev->rtnl_link_ops) { strscpy(rsp->info.driver, dev->rtnl_link_ops->kind, sizeof(rsp->info.driver)); } else { return -EOPNOTSUPP; } /* * this method of obtaining string set info is deprecated; * Use ETHTOOL_GSSET_INFO instead. */ if (ops->get_sset_count) { int rc; rc = ops->get_sset_count(dev, ETH_SS_TEST); if (rc >= 0) rsp->info.testinfo_len = rc; rc = ops->get_sset_count(dev, ETH_SS_STATS); if (rc >= 0) rsp->info.n_stats = rc; rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); if (rc >= 0) rsp->info.n_priv_flags = rc; } if (ops->get_regs_len) { int ret = ops->get_regs_len(dev); if (ret > 0) rsp->info.regdump_len = ret; } if (ops->get_eeprom_len) rsp->info.eedump_len = ops->get_eeprom_len(dev); if (!rsp->info.fw_version[0]) rsp->devlink = netdev_to_devlink_get(dev); return 0; } static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev, void __user *useraddr) { struct ethtool_sset_info info; u64 sset_mask; int i, idx = 0, n_bits = 0, ret, rc; u32 *info_buf = NULL; if (copy_from_user(&info, useraddr, sizeof(info))) return -EFAULT; /* store copy of mask, because we zero struct later on */ sset_mask = info.sset_mask; if (!sset_mask) return 0; /* calculate size of return buffer */ n_bits = hweight64(sset_mask); memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GSSET_INFO; info_buf = kcalloc(n_bits, sizeof(u32), GFP_USER); if (!info_buf) return -ENOMEM; /* * fill return buffer based on input bitmask and successful * get_sset_count return */ for (i = 0; i < 64; i++) { if (!(sset_mask & (1ULL << i))) continue; rc = __ethtool_get_sset_count(dev, i); if (rc >= 0) { info.sset_mask |= (1ULL << i); info_buf[idx++] = rc; } } ret = -EFAULT; if (copy_to_user(useraddr, &info, sizeof(info))) goto out; useraddr += offsetof(struct ethtool_sset_info, data); if (copy_to_user(useraddr, info_buf, array_size(idx, sizeof(u32)))) goto out; ret = 0; out: kfree(info_buf); return ret; } static noinline_for_stack int ethtool_rxnfc_copy_from_compat(struct ethtool_rxnfc *rxnfc, const struct compat_ethtool_rxnfc __user *useraddr, size_t size) { struct compat_ethtool_rxnfc crxnfc = {}; /* We expect there to be holes between fs.m_ext and * fs.ring_cookie and at the end of fs, but nowhere else. * On non-x86, no conversion should be needed. */ BUILD_BUG_ON(!IS_ENABLED(CONFIG_X86_64) && sizeof(struct compat_ethtool_rxnfc) != sizeof(struct ethtool_rxnfc)); BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) + sizeof(useraddr->fs.m_ext) != offsetof(struct ethtool_rxnfc, fs.m_ext) + sizeof(rxnfc->fs.m_ext)); BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.location) - offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) != offsetof(struct ethtool_rxnfc, fs.location) - offsetof(struct ethtool_rxnfc, fs.ring_cookie)); if (copy_from_user(&crxnfc, useraddr, min(size, sizeof(crxnfc)))) return -EFAULT; *rxnfc = (struct ethtool_rxnfc) { .cmd = crxnfc.cmd, .flow_type = crxnfc.flow_type, .data = crxnfc.data, .fs = { .flow_type = crxnfc.fs.flow_type, .h_u = crxnfc.fs.h_u, .h_ext = crxnfc.fs.h_ext, .m_u = crxnfc.fs.m_u, .m_ext = crxnfc.fs.m_ext, .ring_cookie = crxnfc.fs.ring_cookie, .location = crxnfc.fs.location, }, .rule_cnt = crxnfc.rule_cnt, }; return 0; } static int ethtool_rxnfc_copy_from_user(struct ethtool_rxnfc *rxnfc, const void __user *useraddr, size_t size) { if (compat_need_64bit_alignment_fixup()) return ethtool_rxnfc_copy_from_compat(rxnfc, useraddr, size); if (copy_from_user(rxnfc, useraddr, size)) return -EFAULT; return 0; } static int ethtool_rxnfc_copy_to_compat(void __user *useraddr, const struct ethtool_rxnfc *rxnfc, size_t size, const u32 *rule_buf) { struct compat_ethtool_rxnfc crxnfc; memset(&crxnfc, 0, sizeof(crxnfc)); crxnfc = (struct compat_ethtool_rxnfc) { .cmd = rxnfc->cmd, .flow_type = rxnfc->flow_type, .data = rxnfc->data, .fs = { .flow_type = rxnfc->fs.flow_type, .h_u = rxnfc->fs.h_u, .h_ext = rxnfc->fs.h_ext, .m_u = rxnfc->fs.m_u, .m_ext = rxnfc->fs.m_ext, .ring_cookie = rxnfc->fs.ring_cookie, .location = rxnfc->fs.location, }, .rule_cnt = rxnfc->rule_cnt, }; if (copy_to_user(useraddr, &crxnfc, min(size, sizeof(crxnfc)))) return -EFAULT; return 0; } static int ethtool_rxnfc_copy_struct(u32 cmd, struct ethtool_rxnfc *info, size_t *info_size, void __user *useraddr) { /* struct ethtool_rxnfc was originally defined for * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data * members. User-space might still be using that * definition. */ if (cmd == ETHTOOL_GRXFH || cmd == ETHTOOL_SRXFH) *info_size = (offsetof(struct ethtool_rxnfc, data) + sizeof(info->data)); if (ethtool_rxnfc_copy_from_user(info, useraddr, *info_size)) return -EFAULT; if ((cmd == ETHTOOL_GRXFH || cmd == ETHTOOL_SRXFH) && info->flow_type & FLOW_RSS) { *info_size = sizeof(*info); if (ethtool_rxnfc_copy_from_user(info, useraddr, *info_size)) return -EFAULT; /* Since malicious users may modify the original data, * we need to check whether FLOW_RSS is still requested. */ if (!(info->flow_type & FLOW_RSS)) return -EINVAL; } if (info->cmd != cmd) return -EINVAL; return 0; } static int ethtool_rxnfc_copy_to_user(void __user *useraddr, const struct ethtool_rxnfc *rxnfc, size_t size, const u32 *rule_buf) { int ret; if (compat_need_64bit_alignment_fixup()) { ret = ethtool_rxnfc_copy_to_compat(useraddr, rxnfc, size, rule_buf); useraddr += offsetof(struct compat_ethtool_rxnfc, rule_locs); } else { ret = copy_to_user(useraddr, rxnfc, size); useraddr += offsetof(struct ethtool_rxnfc, rule_locs); } if (ret) return -EFAULT; if (rule_buf) { if (copy_to_user(useraddr, rule_buf, rxnfc->rule_cnt * sizeof(u32))) return -EFAULT; } return 0; } static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_rxnfc info; size_t info_size = sizeof(info); int rc; if (!ops->set_rxnfc) return -EOPNOTSUPP; rc = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); if (rc) return rc; if (ops->get_rxfh) { struct ethtool_rxfh_param rxfh = {}; rc = ops->get_rxfh(dev, &rxfh); if (rc) return rc; /* Sanity check: if symmetric-xor is set, then: * 1 - no other fields besides IP src/dst and/or L4 src/dst * 2 - If src is set, dst must also be set */ if ((rxfh.input_xfrm & RXH_XFRM_SYM_XOR) && ((info.data & ~(RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3)) || (!!(info.data & RXH_IP_SRC) ^ !!(info.data & RXH_IP_DST)) || (!!(info.data & RXH_L4_B_0_1) ^ !!(info.data & RXH_L4_B_2_3)))) return -EINVAL; } rc = ops->set_rxnfc(dev, &info); if (rc) return rc; if (cmd == ETHTOOL_SRXCLSRLINS && ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL)) return -EFAULT; return 0; } static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { struct ethtool_rxnfc info; size_t info_size = sizeof(info); const struct ethtool_ops *ops = dev->ethtool_ops; int ret; void *rule_buf = NULL; if (!ops->get_rxnfc) return -EOPNOTSUPP; ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); if (ret) return ret; if (info.cmd == ETHTOOL_GRXCLSRLALL) { if (info.rule_cnt > 0) { if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32)) rule_buf = kcalloc(info.rule_cnt, sizeof(u32), GFP_USER); if (!rule_buf) return -ENOMEM; } } ret = ops->get_rxnfc(dev, &info, rule_buf); if (ret < 0) goto err_out; ret = ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf); err_out: kfree(rule_buf); return ret; } static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, struct ethtool_rxnfc *rx_rings, u32 size) { int i; if (copy_from_user(indir, useraddr, array_size(size, sizeof(indir[0])))) return -EFAULT; /* Validate ring indices */ for (i = 0; i < size; i++) if (indir[i] >= rx_rings->data) return -EINVAL; return 0; } u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly; void netdev_rss_key_fill(void *buffer, size_t len) { BUG_ON(len > sizeof(netdev_rss_key)); net_get_random_once(netdev_rss_key, sizeof(netdev_rss_key)); memcpy(buffer, netdev_rss_key, len); } EXPORT_SYMBOL(netdev_rss_key_fill); static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev, void __user *useraddr) { struct ethtool_rxfh_param rxfh = {}; u32 user_size; int ret; if (!dev->ethtool_ops->get_rxfh_indir_size || !dev->ethtool_ops->get_rxfh) return -EOPNOTSUPP; rxfh.indir_size = dev->ethtool_ops->get_rxfh_indir_size(dev); if (rxfh.indir_size == 0) return -EOPNOTSUPP; if (copy_from_user(&user_size, useraddr + offsetof(struct ethtool_rxfh_indir, size), sizeof(user_size))) return -EFAULT; if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size), &rxfh.indir_size, sizeof(rxfh.indir_size))) return -EFAULT; /* If the user buffer size is 0, this is just a query for the * device table size. Otherwise, if it's smaller than the * device table size it's an error. */ if (user_size < rxfh.indir_size) return user_size == 0 ? 0 : -EINVAL; rxfh.indir = kcalloc(rxfh.indir_size, sizeof(rxfh.indir[0]), GFP_USER); if (!rxfh.indir) return -ENOMEM; ret = dev->ethtool_ops->get_rxfh(dev, &rxfh); if (ret) goto out; if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, ring_index[0]), rxfh.indir, rxfh.indir_size * sizeof(*rxfh.indir))) ret = -EFAULT; out: kfree(rxfh.indir); return ret; } static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, void __user *useraddr) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_rxfh_param rxfh_dev = {}; struct netlink_ext_ack *extack = NULL; struct ethtool_rxnfc rx_rings; u32 user_size, i; int ret; u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]); if (!ops->get_rxfh_indir_size || !ops->set_rxfh || !ops->get_rxnfc) return -EOPNOTSUPP; rxfh_dev.indir_size = ops->get_rxfh_indir_size(dev); if (rxfh_dev.indir_size == 0) return -EOPNOTSUPP; if (copy_from_user(&user_size, useraddr + offsetof(struct ethtool_rxfh_indir, size), sizeof(user_size))) return -EFAULT; if (user_size != 0 && user_size != rxfh_dev.indir_size) return -EINVAL; rxfh_dev.indir = kcalloc(rxfh_dev.indir_size, sizeof(rxfh_dev.indir[0]), GFP_USER); if (!rxfh_dev.indir) return -ENOMEM; rx_rings.cmd = ETHTOOL_GRXRINGS; ret = ops->get_rxnfc(dev, &rx_rings, NULL); if (ret) goto out; if (user_size == 0) { u32 *indir = rxfh_dev.indir; for (i = 0; i < rxfh_dev.indir_size; i++) indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); } else { ret = ethtool_copy_validate_indir(rxfh_dev.indir, useraddr + ringidx_offset, &rx_rings, rxfh_dev.indir_size); if (ret) goto out; } rxfh_dev.hfunc = ETH_RSS_HASH_NO_CHANGE; ret = ops->set_rxfh(dev, &rxfh_dev, extack); if (ret) goto out; /* indicate whether rxfh was set to default */ if (user_size == 0) dev->priv_flags &= ~IFF_RXFH_CONFIGURED; else dev->priv_flags |= IFF_RXFH_CONFIGURED; out: kfree(rxfh_dev.indir); return ret; } static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev, void __user *useraddr) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_rxfh_param rxfh_dev = {}; u32 user_indir_size, user_key_size; struct ethtool_rxfh_context *ctx; struct ethtool_rxfh rxfh; u32 indir_bytes; u8 *rss_config; u32 total_size; int ret; if (!ops->get_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) rxfh_dev.indir_size = ops->get_rxfh_indir_size(dev); if (ops->get_rxfh_key_size) rxfh_dev.key_size = ops->get_rxfh_key_size(dev); if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; user_indir_size = rxfh.indir_size; user_key_size = rxfh.key_size; /* Check that reserved fields are 0 for now */ if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32) return -EINVAL; /* Most drivers don't handle rss_context, check it's 0 as well */ if (rxfh.rss_context && !(ops->cap_rss_ctx_supported || ops->create_rxfh_context)) return -EOPNOTSUPP; rxfh.indir_size = rxfh_dev.indir_size; rxfh.key_size = rxfh_dev.key_size; if (copy_to_user(useraddr, &rxfh, sizeof(rxfh))) return -EFAULT; if ((user_indir_size && user_indir_size != rxfh_dev.indir_size) || (user_key_size && user_key_size != rxfh_dev.key_size)) return -EINVAL; indir_bytes = user_indir_size * sizeof(rxfh_dev.indir[0]); total_size = indir_bytes + user_key_size; rss_config = kzalloc(total_size, GFP_USER); if (!rss_config) return -ENOMEM; if (user_indir_size) rxfh_dev.indir = (u32 *)rss_config; if (user_key_size) rxfh_dev.key = rss_config + indir_bytes; if (rxfh.rss_context) { ctx = xa_load(&dev->ethtool->rss_ctx, rxfh.rss_context); if (!ctx) { ret = -ENOENT; goto out; } if (rxfh_dev.indir) memcpy(rxfh_dev.indir, ethtool_rxfh_context_indir(ctx), indir_bytes); if (!ops->rxfh_per_ctx_key) { rxfh_dev.key_size = 0; } else { if (rxfh_dev.key) memcpy(rxfh_dev.key, ethtool_rxfh_context_key(ctx), user_key_size); rxfh_dev.hfunc = ctx->hfunc; } rxfh_dev.input_xfrm = ctx->input_xfrm; ret = 0; } else { ret = dev->ethtool_ops->get_rxfh(dev, &rxfh_dev); if (ret) goto out; } if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc), &rxfh_dev.hfunc, sizeof(rxfh.hfunc))) { ret = -EFAULT; } else if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, input_xfrm), &rxfh_dev.input_xfrm, sizeof(rxfh.input_xfrm))) { ret = -EFAULT; } else if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, key_size), &rxfh_dev.key_size, sizeof(rxfh.key_size))) { ret = -EFAULT; } else if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_config[0]), rss_config, total_size)) { ret = -EFAULT; } out: kfree(rss_config); return ret; } static struct ethtool_rxfh_context * ethtool_rxfh_ctx_alloc(const struct ethtool_ops *ops, u32 indir_size, u32 key_size) { size_t indir_bytes, flex_len, key_off, size; struct ethtool_rxfh_context *ctx; u32 priv_bytes, indir_max; u16 key_max; key_max = max(key_size, ops->rxfh_key_space); indir_max = max(indir_size, ops->rxfh_indir_space); priv_bytes = ALIGN(ops->rxfh_priv_size, sizeof(u32)); indir_bytes = array_size(indir_max, sizeof(u32)); key_off = size_add(priv_bytes, indir_bytes); flex_len = size_add(key_off, key_max); size = struct_size_t(struct ethtool_rxfh_context, data, flex_len); ctx = kzalloc(size, GFP_KERNEL_ACCOUNT); if (!ctx) return NULL; ctx->indir_size = indir_size; ctx->key_size = key_size; ctx->key_off = key_off; ctx->priv_size = ops->rxfh_priv_size; ctx->hfunc = ETH_RSS_HASH_NO_CHANGE; ctx->input_xfrm = RXH_XFRM_NO_CHANGE; return ctx; } static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, void __user *useraddr) { u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]); const struct ethtool_ops *ops = dev->ethtool_ops; u32 dev_indir_size = 0, dev_key_size = 0, i; u32 user_indir_len = 0, indir_bytes = 0; struct ethtool_rxfh_param rxfh_dev = {}; struct ethtool_rxfh_context *ctx = NULL; struct netlink_ext_ack *extack = NULL; struct ethtool_rxnfc rx_rings; struct ethtool_rxfh rxfh; bool locked = false; /* dev->ethtool->rss_lock taken */ bool create = false; u8 *rss_config; int ret; if (!ops->get_rxnfc || !ops->set_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) dev_indir_size = ops->get_rxfh_indir_size(dev); if (ops->get_rxfh_key_size) dev_key_size = ops->get_rxfh_key_size(dev); if (copy_from_user(&rxfh, useraddr, sizeof(rxfh))) return -EFAULT; /* Check that reserved fields are 0 for now */ if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32) return -EINVAL; /* Most drivers don't handle rss_context, check it's 0 as well */ if (rxfh.rss_context && !(ops->cap_rss_ctx_supported || ops->create_rxfh_context)) return -EOPNOTSUPP; /* Check input data transformation capabilities */ if (rxfh.input_xfrm && rxfh.input_xfrm != RXH_XFRM_SYM_XOR && rxfh.input_xfrm != RXH_XFRM_NO_CHANGE) return -EINVAL; if (rxfh.input_xfrm != RXH_XFRM_NO_CHANGE && (rxfh.input_xfrm & RXH_XFRM_SYM_XOR) && !ops->cap_rss_sym_xor_supported) return -EOPNOTSUPP; create = rxfh.rss_context == ETH_RXFH_CONTEXT_ALLOC; if ((rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE && rxfh.indir_size != dev_indir_size) || (rxfh.key_size && rxfh.key_size != dev_key_size)) return -EINVAL; /* Must request at least one change: indir size, hash key, function * or input transformation. * There's no need for any of it in case of context creation. */ if (!create && (rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE && rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE && rxfh.input_xfrm == RXH_XFRM_NO_CHANGE)) return -EINVAL; indir_bytes = dev_indir_size * sizeof(rxfh_dev.indir[0]); /* Check settings which may be global rather than per RSS-context */ if (rxfh.rss_context && !ops->rxfh_per_ctx_key) if (rxfh.key_size || (rxfh.hfunc && rxfh.hfunc != ETH_RSS_HASH_NO_CHANGE) || (rxfh.input_xfrm && rxfh.input_xfrm != RXH_XFRM_NO_CHANGE)) return -EOPNOTSUPP; rss_config = kzalloc(indir_bytes + dev_key_size, GFP_USER); if (!rss_config) return -ENOMEM; rx_rings.cmd = ETHTOOL_GRXRINGS; ret = ops->get_rxnfc(dev, &rx_rings, NULL); if (ret) goto out; /* rxfh.indir_size == 0 means reset the indir table to default (master * context) or delete the context (other RSS contexts). * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged. */ if (rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) { user_indir_len = indir_bytes; rxfh_dev.indir = (u32 *)rss_config; rxfh_dev.indir_size = dev_indir_size; ret = ethtool_copy_validate_indir(rxfh_dev.indir, useraddr + rss_cfg_offset, &rx_rings, rxfh.indir_size); if (ret) goto out; } else if (rxfh.indir_size == 0) { if (rxfh.rss_context == 0) { u32 *indir; rxfh_dev.indir = (u32 *)rss_config; rxfh_dev.indir_size = dev_indir_size; indir = rxfh_dev.indir; for (i = 0; i < dev_indir_size; i++) indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); } else { rxfh_dev.rss_delete = true; } } if (rxfh.key_size) { rxfh_dev.key_size = dev_key_size; rxfh_dev.key = rss_config + indir_bytes; if (copy_from_user(rxfh_dev.key, useraddr + rss_cfg_offset + user_indir_len, rxfh.key_size)) { ret = -EFAULT; goto out; } } if (rxfh.rss_context) { mutex_lock(&dev->ethtool->rss_lock); locked = true; } if (create) { if (rxfh_dev.rss_delete) { ret = -EINVAL; goto out; } ctx = ethtool_rxfh_ctx_alloc(ops, dev_indir_size, dev_key_size); if (!ctx) { ret = -ENOMEM; goto out; } if (ops->create_rxfh_context) { u32 limit = ops->rxfh_max_num_contexts ?: U32_MAX; u32 ctx_id; /* driver uses new API, core allocates ID */ ret = xa_alloc(&dev->ethtool->rss_ctx, &ctx_id, ctx, XA_LIMIT(1, limit - 1), GFP_KERNEL_ACCOUNT); if (ret < 0) { kfree(ctx); goto out; } WARN_ON(!ctx_id); /* can't happen */ rxfh.rss_context = ctx_id; } } else if (rxfh.rss_context) { ctx = xa_load(&dev->ethtool->rss_ctx, rxfh.rss_context); if (!ctx) { ret = -ENOENT; goto out; } } rxfh_dev.hfunc = rxfh.hfunc; rxfh_dev.rss_context = rxfh.rss_context; rxfh_dev.input_xfrm = rxfh.input_xfrm; if (rxfh.rss_context && ops->create_rxfh_context) { if (create) { ret = ops->create_rxfh_context(dev, ctx, &rxfh_dev, extack); /* Make sure driver populates defaults */ WARN_ON_ONCE(!ret && !rxfh_dev.key && !memchr_inv(ethtool_rxfh_context_key(ctx), 0, ctx->key_size)); } else if (rxfh_dev.rss_delete) { ret = ops->remove_rxfh_context(dev, ctx, rxfh.rss_context, extack); } else { ret = ops->modify_rxfh_context(dev, ctx, &rxfh_dev, extack); } } else { ret = ops->set_rxfh(dev, &rxfh_dev, extack); } if (ret) { if (create) { /* failed to create, free our new tracking entry */ if (ops->create_rxfh_context) xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context); kfree(ctx); } goto out; } if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context), &rxfh_dev.rss_context, sizeof(rxfh_dev.rss_context))) ret = -EFAULT; if (!rxfh_dev.rss_context) { /* indicate whether rxfh was set to default */ if (rxfh.indir_size == 0) dev->priv_flags &= ~IFF_RXFH_CONFIGURED; else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) dev->priv_flags |= IFF_RXFH_CONFIGURED; } /* Update rss_ctx tracking */ if (create && !ops->create_rxfh_context) { /* driver uses old API, it chose context ID */ if (WARN_ON(xa_load(&dev->ethtool->rss_ctx, rxfh_dev.rss_context))) { /* context ID reused, our tracking is screwed */ kfree(ctx); goto out; } /* Allocate the exact ID the driver gave us */ if (xa_is_err(xa_store(&dev->ethtool->rss_ctx, rxfh_dev.rss_context, ctx, GFP_KERNEL))) { kfree(ctx); goto out; } /* Fetch the defaults for the old API, in the new API drivers * should write defaults into ctx themselves. */ rxfh_dev.indir = (u32 *)rss_config; rxfh_dev.indir_size = dev_indir_size; rxfh_dev.key = rss_config + indir_bytes; rxfh_dev.key_size = dev_key_size; ret = ops->get_rxfh(dev, &rxfh_dev); if (WARN_ON(ret)) { xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context); kfree(ctx); goto out; } } if (rxfh_dev.rss_delete) { WARN_ON(xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context) != ctx); kfree(ctx); } else if (ctx) { if (rxfh_dev.indir) { for (i = 0; i < dev_indir_size; i++) ethtool_rxfh_context_indir(ctx)[i] = rxfh_dev.indir[i]; ctx->indir_configured = rxfh.indir_size && rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE; } if (rxfh_dev.key) { memcpy(ethtool_rxfh_context_key(ctx), rxfh_dev.key, dev_key_size); ctx->key_configured = !!rxfh.key_size; } if (rxfh_dev.hfunc != ETH_RSS_HASH_NO_CHANGE) ctx->hfunc = rxfh_dev.hfunc; if (rxfh_dev.input_xfrm != RXH_XFRM_NO_CHANGE) ctx->input_xfrm = rxfh_dev.input_xfrm; } out: if (locked) mutex_unlock(&dev->ethtool->rss_lock); kfree(rss_config); return ret; } static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) { struct ethtool_regs regs; const struct ethtool_ops *ops = dev->ethtool_ops; void *regbuf; int reglen, ret; if (!ops->get_regs || !ops->get_regs_len) return -EOPNOTSUPP; if (copy_from_user(®s, useraddr, sizeof(regs))) return -EFAULT; reglen = ops->get_regs_len(dev); if (reglen <= 0) return reglen; if (regs.len > reglen) regs.len = reglen; regbuf = vzalloc(reglen); if (!regbuf) return -ENOMEM; if (regs.len < reglen) reglen = regs.len; ops->get_regs(dev, ®s, regbuf); ret = -EFAULT; if (copy_to_user(useraddr, ®s, sizeof(regs))) goto out; useraddr += offsetof(struct ethtool_regs, data); if (copy_to_user(useraddr, regbuf, reglen)) goto out; ret = 0; out: vfree(regbuf); return ret; } static int ethtool_reset(struct net_device *dev, char __user *useraddr) { struct ethtool_value reset; int ret; if (!dev->ethtool_ops->reset) return -EOPNOTSUPP; if (dev->ethtool->module_fw_flash_in_progress) return -EBUSY; if (copy_from_user(&reset, useraddr, sizeof(reset))) return -EFAULT; ret = dev->ethtool_ops->reset(dev, &reset.data); if (ret) return ret; if (copy_to_user(useraddr, &reset, sizeof(reset))) return -EFAULT; return 0; } static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) { struct ethtool_wolinfo wol; if (!dev->ethtool_ops->get_wol) return -EOPNOTSUPP; memset(&wol, 0, sizeof(struct ethtool_wolinfo)); wol.cmd = ETHTOOL_GWOL; dev->ethtool_ops->get_wol(dev, &wol); if (copy_to_user(useraddr, &wol, sizeof(wol))) return -EFAULT; return 0; } static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) { struct ethtool_wolinfo wol, cur_wol; int ret; if (!dev->ethtool_ops->get_wol || !dev->ethtool_ops->set_wol) return -EOPNOTSUPP; memset(&cur_wol, 0, sizeof(struct ethtool_wolinfo)); cur_wol.cmd = ETHTOOL_GWOL; dev->ethtool_ops->get_wol(dev, &cur_wol); if (copy_from_user(&wol, useraddr, sizeof(wol))) return -EFAULT; if (wol.wolopts & ~cur_wol.supported) return -EINVAL; if (wol.wolopts == cur_wol.wolopts && !memcmp(wol.sopass, cur_wol.sopass, sizeof(wol.sopass))) return 0; ret = dev->ethtool_ops->set_wol(dev, &wol); if (ret) return ret; dev->ethtool->wol_enabled = !!wol.wolopts; ethtool_notify(dev, ETHTOOL_MSG_WOL_NTF, NULL); return 0; } static void eee_to_keee(struct ethtool_keee *keee, const struct ethtool_eee *eee) { memset(keee, 0, sizeof(*keee)); keee->eee_enabled = eee->eee_enabled; keee->tx_lpi_enabled = eee->tx_lpi_enabled; keee->tx_lpi_timer = eee->tx_lpi_timer; ethtool_convert_legacy_u32_to_link_mode(keee->advertised, eee->advertised); } static void keee_to_eee(struct ethtool_eee *eee, const struct ethtool_keee *keee) { bool overflow; memset(eee, 0, sizeof(*eee)); eee->eee_active = keee->eee_active; eee->eee_enabled = keee->eee_enabled; eee->tx_lpi_enabled = keee->tx_lpi_enabled; eee->tx_lpi_timer = keee->tx_lpi_timer; overflow = !ethtool_convert_link_mode_to_legacy_u32(&eee->supported, keee->supported); ethtool_convert_link_mode_to_legacy_u32(&eee->advertised, keee->advertised); ethtool_convert_link_mode_to_legacy_u32(&eee->lp_advertised, keee->lp_advertised); if (overflow) pr_warn("Ethtool ioctl interface doesn't support passing EEE linkmodes beyond bit 32\n"); } static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) { struct ethtool_keee keee; struct ethtool_eee eee; int rc; if (!dev->ethtool_ops->get_eee) return -EOPNOTSUPP; memset(&keee, 0, sizeof(keee)); rc = dev->ethtool_ops->get_eee(dev, &keee); if (rc) return rc; keee_to_eee(&eee, &keee); if (copy_to_user(useraddr, &eee, sizeof(eee))) return -EFAULT; return 0; } static int ethtool_set_eee(struct net_device *dev, char __user *useraddr) { struct ethtool_keee keee; struct ethtool_eee eee; int ret; if (!dev->ethtool_ops->set_eee) return -EOPNOTSUPP; if (copy_from_user(&eee, useraddr, sizeof(eee))) return -EFAULT; eee_to_keee(&keee, &eee); ret = dev->ethtool_ops->set_eee(dev, &keee); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF, NULL); return ret; } static int ethtool_nway_reset(struct net_device *dev) { if (!dev->ethtool_ops->nway_reset) return -EOPNOTSUPP; return dev->ethtool_ops->nway_reset(dev); } static int ethtool_get_link(struct net_device *dev, char __user *useraddr) { struct ethtool_value edata = { .cmd = ETHTOOL_GLINK }; int link = __ethtool_get_link(dev); if (link < 0) return link; edata.data = link; if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; return 0; } static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr, int (*getter)(struct net_device *, struct ethtool_eeprom *, u8 *), u32 total_len) { struct ethtool_eeprom eeprom; void __user *userbuf = useraddr + sizeof(eeprom); u32 bytes_remaining; u8 *data; int ret = 0; if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) return -EFAULT; /* Check for wrap and zero */ if (eeprom.offset + eeprom.len <= eeprom.offset) return -EINVAL; /* Check for exceeding total eeprom len */ if (eeprom.offset + eeprom.len > total_len) return -EINVAL; data = kzalloc(PAGE_SIZE, GFP_USER); if (!data) return -ENOMEM; bytes_remaining = eeprom.len; while (bytes_remaining > 0) { eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); ret = getter(dev, &eeprom, data); if (ret) break; if (!eeprom.len) { ret = -EIO; break; } if (copy_to_user(userbuf, data, eeprom.len)) { ret = -EFAULT; break; } userbuf += eeprom.len; eeprom.offset += eeprom.len; bytes_remaining -= eeprom.len; } eeprom.len = userbuf - (useraddr + sizeof(eeprom)); eeprom.offset -= eeprom.len; if (copy_to_user(useraddr, &eeprom, sizeof(eeprom))) ret = -EFAULT; kfree(data); return ret; } static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) { const struct ethtool_ops *ops = dev->ethtool_ops; if (!ops->get_eeprom || !ops->get_eeprom_len || !ops->get_eeprom_len(dev)) return -EOPNOTSUPP; return ethtool_get_any_eeprom(dev, useraddr, ops->get_eeprom, ops->get_eeprom_len(dev)); } static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) { struct ethtool_eeprom eeprom; const struct ethtool_ops *ops = dev->ethtool_ops; void __user *userbuf = useraddr + sizeof(eeprom); u32 bytes_remaining; u8 *data; int ret = 0; if (!ops->set_eeprom || !ops->get_eeprom_len || !ops->get_eeprom_len(dev)) return -EOPNOTSUPP; if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) return -EFAULT; /* Check for wrap and zero */ if (eeprom.offset + eeprom.len <= eeprom.offset) return -EINVAL; /* Check for exceeding total eeprom len */ if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) return -EINVAL; data = kzalloc(PAGE_SIZE, GFP_USER); if (!data) return -ENOMEM; bytes_remaining = eeprom.len; while (bytes_remaining > 0) { eeprom.len = min(bytes_remaining, (u32)PAGE_SIZE); if (copy_from_user(data, userbuf, eeprom.len)) { ret = -EFAULT; break; } ret = ops->set_eeprom(dev, &eeprom, data); if (ret) break; userbuf += eeprom.len; eeprom.offset += eeprom.len; bytes_remaining -= eeprom.len; } kfree(data); return ret; } static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) { struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; struct kernel_ethtool_coalesce kernel_coalesce = {}; int ret; if (!dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce, NULL); if (ret) return ret; if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) return -EFAULT; return 0; } static bool ethtool_set_coalesce_supported(struct net_device *dev, struct ethtool_coalesce *coalesce) { u32 supported_params = dev->ethtool_ops->supported_coalesce_params; u32 nonzero_params = 0; if (coalesce->rx_coalesce_usecs) nonzero_params |= ETHTOOL_COALESCE_RX_USECS; if (coalesce->rx_max_coalesced_frames) nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES; if (coalesce->rx_coalesce_usecs_irq) nonzero_params |= ETHTOOL_COALESCE_RX_USECS_IRQ; if (coalesce->rx_max_coalesced_frames_irq) nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES_IRQ; if (coalesce->tx_coalesce_usecs) nonzero_params |= ETHTOOL_COALESCE_TX_USECS; if (coalesce->tx_max_coalesced_frames) nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES; if (coalesce->tx_coalesce_usecs_irq) nonzero_params |= ETHTOOL_COALESCE_TX_USECS_IRQ; if (coalesce->tx_max_coalesced_frames_irq) nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES_IRQ; if (coalesce->stats_block_coalesce_usecs) nonzero_params |= ETHTOOL_COALESCE_STATS_BLOCK_USECS; if (coalesce->use_adaptive_rx_coalesce) nonzero_params |= ETHTOOL_COALESCE_USE_ADAPTIVE_RX; if (coalesce->use_adaptive_tx_coalesce) nonzero_params |= ETHTOOL_COALESCE_USE_ADAPTIVE_TX; if (coalesce->pkt_rate_low) nonzero_params |= ETHTOOL_COALESCE_PKT_RATE_LOW; if (coalesce->rx_coalesce_usecs_low) nonzero_params |= ETHTOOL_COALESCE_RX_USECS_LOW; if (coalesce->rx_max_coalesced_frames_low) nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES_LOW; if (coalesce->tx_coalesce_usecs_low) nonzero_params |= ETHTOOL_COALESCE_TX_USECS_LOW; if (coalesce->tx_max_coalesced_frames_low) nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES_LOW; if (coalesce->pkt_rate_high) nonzero_params |= ETHTOOL_COALESCE_PKT_RATE_HIGH; if (coalesce->rx_coalesce_usecs_high) nonzero_params |= ETHTOOL_COALESCE_RX_USECS_HIGH; if (coalesce->rx_max_coalesced_frames_high) nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES_HIGH; if (coalesce->tx_coalesce_usecs_high) nonzero_params |= ETHTOOL_COALESCE_TX_USECS_HIGH; if (coalesce->tx_max_coalesced_frames_high) nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH; if (coalesce->rate_sample_interval) nonzero_params |= ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL; return (supported_params & nonzero_params) == nonzero_params; } static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr) { struct kernel_ethtool_coalesce kernel_coalesce = {}; struct ethtool_coalesce coalesce; int ret; if (!dev->ethtool_ops->set_coalesce || !dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce, NULL); if (ret) return ret; if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) return -EFAULT; if (!ethtool_set_coalesce_supported(dev, &coalesce)) return -EOPNOTSUPP; ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce, NULL); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL); return ret; } static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) { struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM }; struct kernel_ethtool_ringparam kernel_ringparam = {}; if (!dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; dev->ethtool_ops->get_ringparam(dev, &ringparam, &kernel_ringparam, NULL); if (copy_to_user(useraddr, &ringparam, sizeof(ringparam))) return -EFAULT; return 0; } static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) { struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM }; struct kernel_ethtool_ringparam kernel_ringparam; int ret; if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) return -EFAULT; dev->ethtool_ops->get_ringparam(dev, &max, &kernel_ringparam, NULL); /* ensure new ring parameters are within the maximums */ if (ringparam.rx_pending > max.rx_max_pending || ringparam.rx_mini_pending > max.rx_mini_max_pending || ringparam.rx_jumbo_pending > max.rx_jumbo_max_pending || ringparam.tx_pending > max.tx_max_pending) return -EINVAL; ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, NULL); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_RINGS_NTF, NULL); return ret; } static noinline_for_stack int ethtool_get_channels(struct net_device *dev, void __user *useraddr) { struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS }; if (!dev->ethtool_ops->get_channels) return -EOPNOTSUPP; dev->ethtool_ops->get_channels(dev, &channels); if (copy_to_user(useraddr, &channels, sizeof(channels))) return -EFAULT; return 0; } static noinline_for_stack int ethtool_set_channels(struct net_device *dev, void __user *useraddr) { struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS }; u16 from_channel, to_channel; unsigned int i; int ret; if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels) return -EOPNOTSUPP; if (copy_from_user(&channels, useraddr, sizeof(channels))) return -EFAULT; dev->ethtool_ops->get_channels(dev, &curr); if (channels.rx_count == curr.rx_count && channels.tx_count == curr.tx_count && channels.combined_count == curr.combined_count && channels.other_count == curr.other_count) return 0; /* ensure new counts are within the maximums */ if (channels.rx_count > curr.max_rx || channels.tx_count > curr.max_tx || channels.combined_count > curr.max_combined || channels.other_count > curr.max_other) return -EINVAL; /* ensure there is at least one RX and one TX channel */ if (!channels.combined_count && (!channels.rx_count || !channels.tx_count)) return -EINVAL; ret = ethtool_check_max_channel(dev, channels, NULL); if (ret) return ret; /* Disabling channels, query zero-copy AF_XDP sockets */ from_channel = channels.combined_count + min(channels.rx_count, channels.tx_count); to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count); for (i = from_channel; i < to_channel; i++) if (xsk_get_pool_from_qid(dev, i)) return -EINVAL; ret = dev->ethtool_ops->set_channels(dev, &channels); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_CHANNELS_NTF, NULL); return ret; } static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr) { struct ethtool_pauseparam pauseparam = { .cmd = ETHTOOL_GPAUSEPARAM }; if (!dev->ethtool_ops->get_pauseparam) return -EOPNOTSUPP; dev->ethtool_ops->get_pauseparam(dev, &pauseparam); if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam))) return -EFAULT; return 0; } static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) { struct ethtool_pauseparam pauseparam; int ret; if (!dev->ethtool_ops->set_pauseparam) return -EOPNOTSUPP; if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam))) return -EFAULT; ret = dev->ethtool_ops->set_pauseparam(dev, &pauseparam); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_PAUSE_NTF, NULL); return ret; } static int ethtool_self_test(struct net_device *dev, char __user *useraddr) { struct ethtool_test test; const struct ethtool_ops *ops = dev->ethtool_ops; u64 *data; int ret, test_len; if (!ops->self_test || !ops->get_sset_count) return -EOPNOTSUPP; test_len = ops->get_sset_count(dev, ETH_SS_TEST); if (test_len < 0) return test_len; WARN_ON(test_len == 0); if (copy_from_user(&test, useraddr, sizeof(test))) return -EFAULT; test.len = test_len; data = kcalloc(test_len, sizeof(u64), GFP_USER); if (!data) return -ENOMEM; netif_testing_on(dev); ops->self_test(dev, &test, data); netif_testing_off(dev); ret = -EFAULT; if (copy_to_user(useraddr, &test, sizeof(test))) goto out; useraddr += sizeof(test); if (copy_to_user(useraddr, data, array_size(test.len, sizeof(u64)))) goto out; ret = 0; out: kfree(data); return ret; } static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) { struct ethtool_gstrings gstrings; u8 *data; int ret; if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) return -EFAULT; ret = __ethtool_get_sset_count(dev, gstrings.string_set); if (ret < 0) return ret; if (ret > S32_MAX / ETH_GSTRING_LEN) return -ENOMEM; WARN_ON_ONCE(!ret); gstrings.len = ret; if (gstrings.len) { data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN)); if (!data) return -ENOMEM; __ethtool_get_strings(dev, gstrings.string_set, data); } else { data = NULL; } ret = -EFAULT; if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) goto out; useraddr += sizeof(gstrings); if (gstrings.len && copy_to_user(useraddr, data, array_size(gstrings.len, ETH_GSTRING_LEN))) goto out; ret = 0; out: vfree(data); return ret; } __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...) { va_list args; va_start(args, fmt); vsnprintf(*data, ETH_GSTRING_LEN, fmt, args); va_end(args); *data += ETH_GSTRING_LEN; } EXPORT_SYMBOL(ethtool_sprintf); void ethtool_puts(u8 **data, const char *str) { strscpy(*data, str, ETH_GSTRING_LEN); *data += ETH_GSTRING_LEN; } EXPORT_SYMBOL(ethtool_puts); static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) { struct ethtool_value id; static bool busy; const struct ethtool_ops *ops = dev->ethtool_ops; netdevice_tracker dev_tracker; int rc; if (!ops->set_phys_id) return -EOPNOTSUPP; if (busy) return -EBUSY; if (copy_from_user(&id, useraddr, sizeof(id))) return -EFAULT; rc = ops->set_phys_id(dev, ETHTOOL_ID_ACTIVE); if (rc < 0) return rc; /* Drop the RTNL lock while waiting, but prevent reentry or * removal of the device. */ busy = true; netdev_hold(dev, &dev_tracker, GFP_KERNEL); rtnl_unlock(); if (rc == 0) { /* Driver will handle this itself */ schedule_timeout_interruptible( id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT); } else { /* Driver expects to be called at twice the frequency in rc */ int n = rc * 2, interval = HZ / n; u64 count = mul_u32_u32(n, id.data); u64 i = 0; do { rtnl_lock(); rc = ops->set_phys_id(dev, (i++ & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON); rtnl_unlock(); if (rc) break; schedule_timeout_interruptible(interval); } while (!signal_pending(current) && (!id.data || i < count)); } rtnl_lock(); netdev_put(dev, &dev_tracker); busy = false; (void) ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE); return rc; } static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) { struct ethtool_stats stats; const struct ethtool_ops *ops = dev->ethtool_ops; u64 *data; int ret, n_stats; if (!ops->get_ethtool_stats || !ops->get_sset_count) return -EOPNOTSUPP; n_stats = ops->get_sset_count(dev, ETH_SS_STATS); if (n_stats < 0) return n_stats; if (n_stats > S32_MAX / sizeof(u64)) return -ENOMEM; WARN_ON_ONCE(!n_stats); if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; stats.n_stats = n_stats; if (n_stats) { data = vzalloc(array_size(n_stats, sizeof(u64))); if (!data) return -ENOMEM; ops->get_ethtool_stats(dev, &stats, data); } else { data = NULL; } ret = -EFAULT; if (copy_to_user(useraddr, &stats, sizeof(stats))) goto out; useraddr += sizeof(stats); if (n_stats && copy_to_user(useraddr, data, array_size(n_stats, sizeof(u64)))) goto out; ret = 0; out: vfree(data); return ret; } static int ethtool_vzalloc_stats_array(int n_stats, u64 **data) { if (n_stats < 0) return n_stats; if (n_stats > S32_MAX / sizeof(u64)) return -ENOMEM; if (WARN_ON_ONCE(!n_stats)) return -EOPNOTSUPP; *data = vzalloc(array_size(n_stats, sizeof(u64))); if (!*data) return -ENOMEM; return 0; } static int ethtool_get_phy_stats_phydev(struct phy_device *phydev, struct ethtool_stats *stats, u64 **data) { const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops; int n_stats, ret; if (!phy_ops || !phy_ops->get_sset_count || !phy_ops->get_stats) return -EOPNOTSUPP; n_stats = phy_ops->get_sset_count(phydev); ret = ethtool_vzalloc_stats_array(n_stats, data); if (ret) return ret; stats->n_stats = n_stats; return phy_ops->get_stats(phydev, stats, *data); } static int ethtool_get_phy_stats_ethtool(struct net_device *dev, struct ethtool_stats *stats, u64 **data) { const struct ethtool_ops *ops = dev->ethtool_ops; int n_stats, ret; if (!ops || !ops->get_sset_count || !ops->get_ethtool_phy_stats) return -EOPNOTSUPP; n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS); ret = ethtool_vzalloc_stats_array(n_stats, data); if (ret) return ret; stats->n_stats = n_stats; ops->get_ethtool_phy_stats(dev, stats, *data); return 0; } static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr) { struct phy_device *phydev = dev->phydev; struct ethtool_stats stats; u64 *data = NULL; int ret = -EOPNOTSUPP; if (copy_from_user(&stats, useraddr, sizeof(stats))) return -EFAULT; if (phydev) ret = ethtool_get_phy_stats_phydev(phydev, &stats, &data); if (ret == -EOPNOTSUPP) ret = ethtool_get_phy_stats_ethtool(dev, &stats, &data); if (ret) goto out; if (copy_to_user(useraddr, &stats, sizeof(stats))) { ret = -EFAULT; goto out; } useraddr += sizeof(stats); if (copy_to_user(useraddr, data, array_size(stats.n_stats, sizeof(u64)))) ret = -EFAULT; out: vfree(data); return ret; } static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr) { struct ethtool_perm_addr epaddr; if (copy_from_user(&epaddr, useraddr, sizeof(epaddr))) return -EFAULT; if (epaddr.size < dev->addr_len) return -ETOOSMALL; epaddr.size = dev->addr_len; if (copy_to_user(useraddr, &epaddr, sizeof(epaddr))) return -EFAULT; useraddr += sizeof(epaddr); if (copy_to_user(useraddr, dev->perm_addr, epaddr.size)) return -EFAULT; return 0; } static int ethtool_get_value(struct net_device *dev, char __user *useraddr, u32 cmd, u32 (*actor)(struct net_device *)) { struct ethtool_value edata = { .cmd = cmd }; if (!actor) return -EOPNOTSUPP; edata.data = actor(dev); if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; return 0; } static int ethtool_set_value_void(struct net_device *dev, char __user *useraddr, void (*actor)(struct net_device *, u32)) { struct ethtool_value edata; if (!actor) return -EOPNOTSUPP; if (copy_from_user(&edata, useraddr, sizeof(edata))) return -EFAULT; actor(dev, edata.data); return 0; } static int ethtool_set_value(struct net_device *dev, char __user *useraddr, int (*actor)(struct net_device *, u32)) { struct ethtool_value edata; if (!actor) return -EOPNOTSUPP; if (copy_from_user(&edata, useraddr, sizeof(edata))) return -EFAULT; return actor(dev, edata.data); } static int ethtool_flash_device(struct net_device *dev, struct ethtool_devlink_compat *req) { if (!dev->ethtool_ops->flash_device) { req->devlink = netdev_to_devlink_get(dev); return 0; } return dev->ethtool_ops->flash_device(dev, &req->efl); } static int ethtool_set_dump(struct net_device *dev, void __user *useraddr) { struct ethtool_dump dump; if (!dev->ethtool_ops->set_dump) return -EOPNOTSUPP; if (copy_from_user(&dump, useraddr, sizeof(dump))) return -EFAULT; return dev->ethtool_ops->set_dump(dev, &dump); } static int ethtool_get_dump_flag(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_dump dump; const struct ethtool_ops *ops = dev->ethtool_ops; if (!ops->get_dump_flag) return -EOPNOTSUPP; if (copy_from_user(&dump, useraddr, sizeof(dump))) return -EFAULT; ret = ops->get_dump_flag(dev, &dump); if (ret) return ret; if (copy_to_user(useraddr, &dump, sizeof(dump))) return -EFAULT; return 0; } static int ethtool_get_dump_data(struct net_device *dev, void __user *useraddr) { int ret; __u32 len; struct ethtool_dump dump, tmp; const struct ethtool_ops *ops = dev->ethtool_ops; void *data = NULL; if (!ops->get_dump_data || !ops->get_dump_flag) return -EOPNOTSUPP; if (copy_from_user(&dump, useraddr, sizeof(dump))) return -EFAULT; memset(&tmp, 0, sizeof(tmp)); tmp.cmd = ETHTOOL_GET_DUMP_FLAG; ret = ops->get_dump_flag(dev, &tmp); if (ret) return ret; len = min(tmp.len, dump.len); if (!len) return -EFAULT; /* Don't ever let the driver think there's more space available * than it requested with .get_dump_flag(). */ dump.len = len; /* Always allocate enough space to hold the whole thing so that the * driver does not need to check the length and bother with partial * dumping. */ data = vzalloc(tmp.len); if (!data) return -ENOMEM; ret = ops->get_dump_data(dev, &dump, data); if (ret) goto out; /* There are two sane possibilities: * 1. The driver's .get_dump_data() does not touch dump.len. * 2. Or it may set dump.len to how much it really writes, which * should be tmp.len (or len if it can do a partial dump). * In any case respond to userspace with the actual length of data * it's receiving. */ WARN_ON(dump.len != len && dump.len != tmp.len); dump.len = len; if (copy_to_user(useraddr, &dump, sizeof(dump))) { ret = -EFAULT; goto out; } useraddr += offsetof(struct ethtool_dump, data); if (copy_to_user(useraddr, data, len)) ret = -EFAULT; out: vfree(data); return ret; } static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr) { struct kernel_ethtool_ts_info kernel_info; struct ethtool_ts_info info = {}; int err; err = __ethtool_get_ts_info(dev, &kernel_info); if (err) return err; info.cmd = kernel_info.cmd; info.so_timestamping = kernel_info.so_timestamping; info.phc_index = kernel_info.phc_index; info.tx_types = kernel_info.tx_types; info.rx_filters = kernel_info.rx_filters; if (copy_to_user(useraddr, &info, sizeof(info))) return -EFAULT; return 0; } int ethtool_get_module_info_call(struct net_device *dev, struct ethtool_modinfo *modinfo) { const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; if (dev->ethtool->module_fw_flash_in_progress) return -EBUSY; if (dev->sfp_bus) return sfp_get_module_info(dev->sfp_bus, modinfo); if (phydev && phydev->drv && phydev->drv->module_info) return phydev->drv->module_info(phydev, modinfo); if (ops->get_module_info) return ops->get_module_info(dev, modinfo); return -EOPNOTSUPP; } static int ethtool_get_module_info(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_modinfo modinfo; if (copy_from_user(&modinfo, useraddr, sizeof(modinfo))) return -EFAULT; ret = ethtool_get_module_info_call(dev, &modinfo); if (ret) return ret; if (copy_to_user(useraddr, &modinfo, sizeof(modinfo))) return -EFAULT; return 0; } int ethtool_get_module_eeprom_call(struct net_device *dev, struct ethtool_eeprom *ee, u8 *data) { const struct ethtool_ops *ops = dev->ethtool_ops; struct phy_device *phydev = dev->phydev; if (dev->ethtool->module_fw_flash_in_progress) return -EBUSY; if (dev->sfp_bus) return sfp_get_module_eeprom(dev->sfp_bus, ee, data); if (phydev && phydev->drv && phydev->drv->module_eeprom) return phydev->drv->module_eeprom(phydev, ee, data); if (ops->get_module_eeprom) return ops->get_module_eeprom(dev, ee, data); return -EOPNOTSUPP; } static int ethtool_get_module_eeprom(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_modinfo modinfo; ret = ethtool_get_module_info_call(dev, &modinfo); if (ret) return ret; return ethtool_get_any_eeprom(dev, useraddr, ethtool_get_module_eeprom_call, modinfo.eeprom_len); } static int ethtool_tunable_valid(const struct ethtool_tunable *tuna) { switch (tuna->id) { case ETHTOOL_RX_COPYBREAK: case ETHTOOL_TX_COPYBREAK: case ETHTOOL_TX_COPYBREAK_BUF_SIZE: if (tuna->len != sizeof(u32) || tuna->type_id != ETHTOOL_TUNABLE_U32) return -EINVAL; break; case ETHTOOL_PFC_PREVENTION_TOUT: if (tuna->len != sizeof(u16) || tuna->type_id != ETHTOOL_TUNABLE_U16) return -EINVAL; break; default: return -EINVAL; } return 0; } static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_tunable tuna; const struct ethtool_ops *ops = dev->ethtool_ops; void *data; if (!ops->get_tunable) return -EOPNOTSUPP; if (copy_from_user(&tuna, useraddr, sizeof(tuna))) return -EFAULT; ret = ethtool_tunable_valid(&tuna); if (ret) return ret; data = kzalloc(tuna.len, GFP_USER); if (!data) return -ENOMEM; ret = ops->get_tunable(dev, &tuna, data); if (ret) goto out; useraddr += sizeof(tuna); ret = -EFAULT; if (copy_to_user(useraddr, data, tuna.len)) goto out; ret = 0; out: kfree(data); return ret; } static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_tunable tuna; const struct ethtool_ops *ops = dev->ethtool_ops; void *data; if (!ops->set_tunable) return -EOPNOTSUPP; if (copy_from_user(&tuna, useraddr, sizeof(tuna))) return -EFAULT; ret = ethtool_tunable_valid(&tuna); if (ret) return ret; useraddr += sizeof(tuna); data = memdup_user(useraddr, tuna.len); if (IS_ERR(data)) return PTR_ERR(data); ret = ops->set_tunable(dev, &tuna, data); kfree(data); return ret; } static noinline_for_stack int ethtool_get_per_queue_coalesce(struct net_device *dev, void __user *useraddr, struct ethtool_per_queue_op *per_queue_opt) { u32 bit; int ret; DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); if (!dev->ethtool_ops->get_per_queue_coalesce) return -EOPNOTSUPP; useraddr += sizeof(*per_queue_opt); bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask, MAX_NUM_QUEUE); for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, &coalesce); if (ret != 0) return ret; if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) return -EFAULT; useraddr += sizeof(coalesce); } return 0; } static noinline_for_stack int ethtool_set_per_queue_coalesce(struct net_device *dev, void __user *useraddr, struct ethtool_per_queue_op *per_queue_opt) { u32 bit; int i, ret = 0; int n_queue; struct ethtool_coalesce *backup = NULL, *tmp = NULL; DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE); if ((!dev->ethtool_ops->set_per_queue_coalesce) || (!dev->ethtool_ops->get_per_queue_coalesce)) return -EOPNOTSUPP; useraddr += sizeof(*per_queue_opt); bitmap_from_arr32(queue_mask, per_queue_opt->queue_mask, MAX_NUM_QUEUE); n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE); tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL); if (!backup) return -ENOMEM; for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) { struct ethtool_coalesce coalesce; ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, tmp); if (ret != 0) goto roll_back; tmp++; if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) { ret = -EFAULT; goto roll_back; } if (!ethtool_set_coalesce_supported(dev, &coalesce)) { ret = -EOPNOTSUPP; goto roll_back; } ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce); if (ret != 0) goto roll_back; useraddr += sizeof(coalesce); } roll_back: if (ret != 0) { tmp = backup; for_each_set_bit(i, queue_mask, bit) { dev->ethtool_ops->set_per_queue_coalesce(dev, i, tmp); tmp++; } } kfree(backup); return ret; } static int noinline_for_stack ethtool_set_per_queue(struct net_device *dev, void __user *useraddr, u32 sub_cmd) { struct ethtool_per_queue_op per_queue_opt; if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt))) return -EFAULT; if (per_queue_opt.sub_command != sub_cmd) return -EINVAL; switch (per_queue_opt.sub_command) { case ETHTOOL_GCOALESCE: return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt); case ETHTOOL_SCOALESCE: return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt); default: return -EOPNOTSUPP; } } static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna) { switch (tuna->id) { case ETHTOOL_PHY_DOWNSHIFT: case ETHTOOL_PHY_FAST_LINK_DOWN: if (tuna->len != sizeof(u8) || tuna->type_id != ETHTOOL_TUNABLE_U8) return -EINVAL; break; case ETHTOOL_PHY_EDPD: if (tuna->len != sizeof(u16) || tuna->type_id != ETHTOOL_TUNABLE_U16) return -EINVAL; break; default: return -EINVAL; } return 0; } static int get_phy_tunable(struct net_device *dev, void __user *useraddr) { struct phy_device *phydev = dev->phydev; struct ethtool_tunable tuna; bool phy_drv_tunable; void *data; int ret; phy_drv_tunable = phydev && phydev->drv && phydev->drv->get_tunable; if (!phy_drv_tunable && !dev->ethtool_ops->get_phy_tunable) return -EOPNOTSUPP; if (copy_from_user(&tuna, useraddr, sizeof(tuna))) return -EFAULT; ret = ethtool_phy_tunable_valid(&tuna); if (ret) return ret; data = kzalloc(tuna.len, GFP_USER); if (!data) return -ENOMEM; if (phy_drv_tunable) { mutex_lock(&phydev->lock); ret = phydev->drv->get_tunable(phydev, &tuna, data); mutex_unlock(&phydev->lock); } else { ret = dev->ethtool_ops->get_phy_tunable(dev, &tuna, data); } if (ret) goto out; useraddr += sizeof(tuna); ret = -EFAULT; if (copy_to_user(useraddr, data, tuna.len)) goto out; ret = 0; out: kfree(data); return ret; } static int set_phy_tunable(struct net_device *dev, void __user *useraddr) { struct phy_device *phydev = dev->phydev; struct ethtool_tunable tuna; bool phy_drv_tunable; void *data; int ret; phy_drv_tunable = phydev && phydev->drv && phydev->drv->get_tunable; if (!phy_drv_tunable && !dev->ethtool_ops->set_phy_tunable) return -EOPNOTSUPP; if (copy_from_user(&tuna, useraddr, sizeof(tuna))) return -EFAULT; ret = ethtool_phy_tunable_valid(&tuna); if (ret) return ret; useraddr += sizeof(tuna); data = memdup_user(useraddr, tuna.len); if (IS_ERR(data)) return PTR_ERR(data); if (phy_drv_tunable) { mutex_lock(&phydev->lock); ret = phydev->drv->set_tunable(phydev, &tuna, data); mutex_unlock(&phydev->lock); } else { ret = dev->ethtool_ops->set_phy_tunable(dev, &tuna, data); } kfree(data); return ret; } static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr) { struct ethtool_fecparam fecparam = { .cmd = ETHTOOL_GFECPARAM }; int rc; if (!dev->ethtool_ops->get_fecparam) return -EOPNOTSUPP; rc = dev->ethtool_ops->get_fecparam(dev, &fecparam); if (rc) return rc; if (WARN_ON_ONCE(fecparam.reserved)) fecparam.reserved = 0; if (copy_to_user(useraddr, &fecparam, sizeof(fecparam))) return -EFAULT; return 0; } static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr) { struct ethtool_fecparam fecparam; if (!dev->ethtool_ops->set_fecparam) return -EOPNOTSUPP; if (copy_from_user(&fecparam, useraddr, sizeof(fecparam))) return -EFAULT; if (!fecparam.fec || fecparam.fec & ETHTOOL_FEC_NONE) return -EINVAL; fecparam.active_fec = 0; fecparam.reserved = 0; return dev->ethtool_ops->set_fecparam(dev, &fecparam); } /* The main entry point in this file. Called from net/core/dev_ioctl.c */ static int __dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr, u32 ethcmd, struct ethtool_devlink_compat *devlink_state) { struct net_device *dev; u32 sub_cmd; int rc; netdev_features_t old_features; dev = __dev_get_by_name(net, ifr->ifr_name); if (!dev) return -ENODEV; if (ethcmd == ETHTOOL_PERQUEUE) { if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd))) return -EFAULT; } else { sub_cmd = ethcmd; } /* Allow some commands to be done by anyone */ switch (sub_cmd) { case ETHTOOL_GSET: case ETHTOOL_GDRVINFO: case ETHTOOL_GMSGLVL: case ETHTOOL_GLINK: case ETHTOOL_GCOALESCE: case ETHTOOL_GRINGPARAM: case ETHTOOL_GPAUSEPARAM: case ETHTOOL_GRXCSUM: case ETHTOOL_GTXCSUM: case ETHTOOL_GSG: case ETHTOOL_GSSET_INFO: case ETHTOOL_GSTRINGS: case ETHTOOL_GSTATS: case ETHTOOL_GPHYSTATS: case ETHTOOL_GTSO: case ETHTOOL_GPERMADDR: case ETHTOOL_GUFO: case ETHTOOL_GGSO: case ETHTOOL_GGRO: case ETHTOOL_GFLAGS: case ETHTOOL_GPFLAGS: case ETHTOOL_GRXFH: case ETHTOOL_GRXRINGS: case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: case ETHTOOL_GRXFHINDIR: case ETHTOOL_GRSSH: case ETHTOOL_GFEATURES: case ETHTOOL_GCHANNELS: case ETHTOOL_GET_TS_INFO: case ETHTOOL_GEEE: case ETHTOOL_GTUNABLE: case ETHTOOL_PHY_GTUNABLE: case ETHTOOL_GLINKSETTINGS: case ETHTOOL_GFECPARAM: break; default: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; } if (dev->dev.parent) pm_runtime_get_sync(dev->dev.parent); if (!netif_device_present(dev)) { rc = -ENODEV; goto out; } if (dev->ethtool_ops->begin) { rc = dev->ethtool_ops->begin(dev); if (rc < 0) goto out; } old_features = dev->features; switch (ethcmd) { case ETHTOOL_GSET: rc = ethtool_get_settings(dev, useraddr); break; case ETHTOOL_SSET: rc = ethtool_set_settings(dev, useraddr); break; case ETHTOOL_GDRVINFO: rc = ethtool_get_drvinfo(dev, devlink_state); break; case ETHTOOL_GREGS: rc = ethtool_get_regs(dev, useraddr); break; case ETHTOOL_GWOL: rc = ethtool_get_wol(dev, useraddr); break; case ETHTOOL_SWOL: rc = ethtool_set_wol(dev, useraddr); break; case ETHTOOL_GMSGLVL: rc = ethtool_get_value(dev, useraddr, ethcmd, dev->ethtool_ops->get_msglevel); break; case ETHTOOL_SMSGLVL: rc = ethtool_set_value_void(dev, useraddr, dev->ethtool_ops->set_msglevel); if (!rc) ethtool_notify(dev, ETHTOOL_MSG_DEBUG_NTF, NULL); break; case ETHTOOL_GEEE: rc = ethtool_get_eee(dev, useraddr); break; case ETHTOOL_SEEE: rc = ethtool_set_eee(dev, useraddr); break; case ETHTOOL_NWAY_RST: rc = ethtool_nway_reset(dev); break; case ETHTOOL_GLINK: rc = ethtool_get_link(dev, useraddr); break; case ETHTOOL_GEEPROM: rc = ethtool_get_eeprom(dev, useraddr); break; case ETHTOOL_SEEPROM: rc = ethtool_set_eeprom(dev, useraddr); break; case ETHTOOL_GCOALESCE: rc = ethtool_get_coalesce(dev, useraddr); break; case ETHTOOL_SCOALESCE: rc = ethtool_set_coalesce(dev, useraddr); break; case ETHTOOL_GRINGPARAM: rc = ethtool_get_ringparam(dev, useraddr); break; case ETHTOOL_SRINGPARAM: rc = ethtool_set_ringparam(dev, useraddr); break; case ETHTOOL_GPAUSEPARAM: rc = ethtool_get_pauseparam(dev, useraddr); break; case ETHTOOL_SPAUSEPARAM: rc = ethtool_set_pauseparam(dev, useraddr); break; case ETHTOOL_TEST: rc = ethtool_self_test(dev, useraddr); break; case ETHTOOL_GSTRINGS: rc = ethtool_get_strings(dev, useraddr); break; case ETHTOOL_PHYS_ID: rc = ethtool_phys_id(dev, useraddr); break; case ETHTOOL_GSTATS: rc = ethtool_get_stats(dev, useraddr); break; case ETHTOOL_GPERMADDR: rc = ethtool_get_perm_addr(dev, useraddr); break; case ETHTOOL_GFLAGS: rc = ethtool_get_value(dev, useraddr, ethcmd, __ethtool_get_flags); break; case ETHTOOL_SFLAGS: rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags); break; case ETHTOOL_GPFLAGS: rc = ethtool_get_value(dev, useraddr, ethcmd, dev->ethtool_ops->get_priv_flags); if (!rc) ethtool_notify(dev, ETHTOOL_MSG_PRIVFLAGS_NTF, NULL); break; case ETHTOOL_SPFLAGS: rc = ethtool_set_value(dev, useraddr, dev->ethtool_ops->set_priv_flags); break; case ETHTOOL_GRXFH: case ETHTOOL_GRXRINGS: case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: rc = ethtool_get_rxnfc(dev, ethcmd, useraddr); break; case ETHTOOL_SRXFH: case ETHTOOL_SRXCLSRLDEL: case ETHTOOL_SRXCLSRLINS: rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); break; case ETHTOOL_FLASHDEV: rc = ethtool_flash_device(dev, devlink_state); break; case ETHTOOL_RESET: rc = ethtool_reset(dev, useraddr); break; case ETHTOOL_GSSET_INFO: rc = ethtool_get_sset_info(dev, useraddr); break; case ETHTOOL_GRXFHINDIR: rc = ethtool_get_rxfh_indir(dev, useraddr); break; case ETHTOOL_SRXFHINDIR: rc = ethtool_set_rxfh_indir(dev, useraddr); break; case ETHTOOL_GRSSH: rc = ethtool_get_rxfh(dev, useraddr); break; case ETHTOOL_SRSSH: rc = ethtool_set_rxfh(dev, useraddr); break; case ETHTOOL_GFEATURES: rc = ethtool_get_features(dev, useraddr); break; case ETHTOOL_SFEATURES: rc = ethtool_set_features(dev, useraddr); break; case ETHTOOL_GTXCSUM: case ETHTOOL_GRXCSUM: case ETHTOOL_GSG: case ETHTOOL_GTSO: case ETHTOOL_GGSO: case ETHTOOL_GGRO: rc = ethtool_get_one_feature(dev, useraddr, ethcmd); break; case ETHTOOL_STXCSUM: case ETHTOOL_SRXCSUM: case ETHTOOL_SSG: case ETHTOOL_STSO: case ETHTOOL_SGSO: case ETHTOOL_SGRO: rc = ethtool_set_one_feature(dev, useraddr, ethcmd); break; case ETHTOOL_GCHANNELS: rc = ethtool_get_channels(dev, useraddr); break; case ETHTOOL_SCHANNELS: rc = ethtool_set_channels(dev, useraddr); break; case ETHTOOL_SET_DUMP: rc = ethtool_set_dump(dev, useraddr); break; case ETHTOOL_GET_DUMP_FLAG: rc = ethtool_get_dump_flag(dev, useraddr); break; case ETHTOOL_GET_DUMP_DATA: rc = ethtool_get_dump_data(dev, useraddr); break; case ETHTOOL_GET_TS_INFO: rc = ethtool_get_ts_info(dev, useraddr); break; case ETHTOOL_GMODULEINFO: rc = ethtool_get_module_info(dev, useraddr); break; case ETHTOOL_GMODULEEEPROM: rc = ethtool_get_module_eeprom(dev, useraddr); break; case ETHTOOL_GTUNABLE: rc = ethtool_get_tunable(dev, useraddr); break; case ETHTOOL_STUNABLE: rc = ethtool_set_tunable(dev, useraddr); break; case ETHTOOL_GPHYSTATS: rc = ethtool_get_phy_stats(dev, useraddr); break; case ETHTOOL_PERQUEUE: rc = ethtool_set_per_queue(dev, useraddr, sub_cmd); break; case ETHTOOL_GLINKSETTINGS: rc = ethtool_get_link_ksettings(dev, useraddr); break; case ETHTOOL_SLINKSETTINGS: rc = ethtool_set_link_ksettings(dev, useraddr); break; case ETHTOOL_PHY_GTUNABLE: rc = get_phy_tunable(dev, useraddr); break; case ETHTOOL_PHY_STUNABLE: rc = set_phy_tunable(dev, useraddr); break; case ETHTOOL_GFECPARAM: rc = ethtool_get_fecparam(dev, useraddr); break; case ETHTOOL_SFECPARAM: rc = ethtool_set_fecparam(dev, useraddr); break; default: rc = -EOPNOTSUPP; } if (dev->ethtool_ops->complete) dev->ethtool_ops->complete(dev); if (old_features != dev->features) netdev_features_change(dev); out: if (dev->dev.parent) pm_runtime_put(dev->dev.parent); return rc; } int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr) { struct ethtool_devlink_compat *state; u32 ethcmd; int rc; if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) return -EFAULT; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return -ENOMEM; switch (ethcmd) { case ETHTOOL_FLASHDEV: if (copy_from_user(&state->efl, useraddr, sizeof(state->efl))) { rc = -EFAULT; goto exit_free; } state->efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; break; } rtnl_lock(); rc = __dev_ethtool(net, ifr, useraddr, ethcmd, state); rtnl_unlock(); if (rc) goto exit_free; switch (ethcmd) { case ETHTOOL_FLASHDEV: if (state->devlink) rc = devlink_compat_flash_update(state->devlink, state->efl.data); break; case ETHTOOL_GDRVINFO: if (state->devlink) devlink_compat_running_version(state->devlink, state->info.fw_version, sizeof(state->info.fw_version)); if (copy_to_user(useraddr, &state->info, sizeof(state->info))) { rc = -EFAULT; goto exit_free; } break; } exit_free: if (state->devlink) devlink_put(state->devlink); kfree(state); return rc; } struct ethtool_rx_flow_key { struct flow_dissector_key_basic basic; union { struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; }; struct flow_dissector_key_ports tp; struct flow_dissector_key_ip ip; struct flow_dissector_key_vlan vlan; struct flow_dissector_key_eth_addrs eth_addrs; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct ethtool_rx_flow_match { struct flow_dissector dissector; struct ethtool_rx_flow_key key; struct ethtool_rx_flow_key mask; }; struct ethtool_rx_flow_rule * ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) { const struct ethtool_rx_flow_spec *fs = input->fs; struct ethtool_rx_flow_match *match; struct ethtool_rx_flow_rule *flow; struct flow_action_entry *act; flow = kzalloc(sizeof(struct ethtool_rx_flow_rule) + sizeof(struct ethtool_rx_flow_match), GFP_KERNEL); if (!flow) return ERR_PTR(-ENOMEM); /* ethtool_rx supports only one single action per rule. */ flow->rule = flow_rule_alloc(1); if (!flow->rule) { kfree(flow); return ERR_PTR(-ENOMEM); } match = (struct ethtool_rx_flow_match *)flow->priv; flow->rule->match.dissector = &match->dissector; flow->rule->match.mask = &match->mask; flow->rule->match.key = &match->key; match->mask.basic.n_proto = htons(0xffff); switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) { case ETHER_FLOW: { const struct ethhdr *ether_spec, *ether_m_spec; ether_spec = &fs->h_u.ether_spec; ether_m_spec = &fs->m_u.ether_spec; if (!is_zero_ether_addr(ether_m_spec->h_source)) { ether_addr_copy(match->key.eth_addrs.src, ether_spec->h_source); ether_addr_copy(match->mask.eth_addrs.src, ether_m_spec->h_source); } if (!is_zero_ether_addr(ether_m_spec->h_dest)) { ether_addr_copy(match->key.eth_addrs.dst, ether_spec->h_dest); ether_addr_copy(match->mask.eth_addrs.dst, ether_m_spec->h_dest); } if (ether_m_spec->h_proto) { match->key.basic.n_proto = ether_spec->h_proto; match->mask.basic.n_proto = ether_m_spec->h_proto; } } break; case TCP_V4_FLOW: case UDP_V4_FLOW: { const struct ethtool_tcpip4_spec *v4_spec, *v4_m_spec; match->key.basic.n_proto = htons(ETH_P_IP); v4_spec = &fs->h_u.tcp_ip4_spec; v4_m_spec = &fs->m_u.tcp_ip4_spec; if (v4_m_spec->ip4src) { match->key.ipv4.src = v4_spec->ip4src; match->mask.ipv4.src = v4_m_spec->ip4src; } if (v4_m_spec->ip4dst) { match->key.ipv4.dst = v4_spec->ip4dst; match->mask.ipv4.dst = v4_m_spec->ip4dst; } if (v4_m_spec->ip4src || v4_m_spec->ip4dst) { match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_IPV4_ADDRS] = offsetof(struct ethtool_rx_flow_key, ipv4); } if (v4_m_spec->psrc) { match->key.tp.src = v4_spec->psrc; match->mask.tp.src = v4_m_spec->psrc; } if (v4_m_spec->pdst) { match->key.tp.dst = v4_spec->pdst; match->mask.tp.dst = v4_m_spec->pdst; } if (v4_m_spec->psrc || v4_m_spec->pdst) { match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = offsetof(struct ethtool_rx_flow_key, tp); } if (v4_m_spec->tos) { match->key.ip.tos = v4_spec->tos; match->mask.ip.tos = v4_m_spec->tos; match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_IP); match->dissector.offset[FLOW_DISSECTOR_KEY_IP] = offsetof(struct ethtool_rx_flow_key, ip); } } break; case TCP_V6_FLOW: case UDP_V6_FLOW: { const struct ethtool_tcpip6_spec *v6_spec, *v6_m_spec; match->key.basic.n_proto = htons(ETH_P_IPV6); v6_spec = &fs->h_u.tcp_ip6_spec; v6_m_spec = &fs->m_u.tcp_ip6_spec; if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6src)) { memcpy(&match->key.ipv6.src, v6_spec->ip6src, sizeof(match->key.ipv6.src)); memcpy(&match->mask.ipv6.src, v6_m_spec->ip6src, sizeof(match->mask.ipv6.src)); } if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6dst)) { memcpy(&match->key.ipv6.dst, v6_spec->ip6dst, sizeof(match->key.ipv6.dst)); memcpy(&match->mask.ipv6.dst, v6_m_spec->ip6dst, sizeof(match->mask.ipv6.dst)); } if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6src) || !ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6dst)) { match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_IPV6_ADDRS] = offsetof(struct ethtool_rx_flow_key, ipv6); } if (v6_m_spec->psrc) { match->key.tp.src = v6_spec->psrc; match->mask.tp.src = v6_m_spec->psrc; } if (v6_m_spec->pdst) { match->key.tp.dst = v6_spec->pdst; match->mask.tp.dst = v6_m_spec->pdst; } if (v6_m_spec->psrc || v6_m_spec->pdst) { match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_PORTS); match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] = offsetof(struct ethtool_rx_flow_key, tp); } if (v6_m_spec->tclass) { match->key.ip.tos = v6_spec->tclass; match->mask.ip.tos = v6_m_spec->tclass; match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_IP); match->dissector.offset[FLOW_DISSECTOR_KEY_IP] = offsetof(struct ethtool_rx_flow_key, ip); } } break; default: ethtool_rx_flow_rule_destroy(flow); return ERR_PTR(-EINVAL); } switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) { case TCP_V4_FLOW: case TCP_V6_FLOW: match->key.basic.ip_proto = IPPROTO_TCP; match->mask.basic.ip_proto = 0xff; break; case UDP_V4_FLOW: case UDP_V6_FLOW: match->key.basic.ip_proto = IPPROTO_UDP; match->mask.basic.ip_proto = 0xff; break; } match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_BASIC); match->dissector.offset[FLOW_DISSECTOR_KEY_BASIC] = offsetof(struct ethtool_rx_flow_key, basic); if (fs->flow_type & FLOW_EXT) { const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext; const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext; if (ext_m_spec->vlan_etype) { match->key.vlan.vlan_tpid = ext_h_spec->vlan_etype; match->mask.vlan.vlan_tpid = ext_m_spec->vlan_etype; } if (ext_m_spec->vlan_tci) { match->key.vlan.vlan_id = ntohs(ext_h_spec->vlan_tci) & 0x0fff; match->mask.vlan.vlan_id = ntohs(ext_m_spec->vlan_tci) & 0x0fff; match->key.vlan.vlan_dei = !!(ext_h_spec->vlan_tci & htons(0x1000)); match->mask.vlan.vlan_dei = !!(ext_m_spec->vlan_tci & htons(0x1000)); match->key.vlan.vlan_priority = (ntohs(ext_h_spec->vlan_tci) & 0xe000) >> 13; match->mask.vlan.vlan_priority = (ntohs(ext_m_spec->vlan_tci) & 0xe000) >> 13; } if (ext_m_spec->vlan_etype || ext_m_spec->vlan_tci) { match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = offsetof(struct ethtool_rx_flow_key, vlan); } } if (fs->flow_type & FLOW_MAC_EXT) { const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext; const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext; memcpy(match->key.eth_addrs.dst, ext_h_spec->h_dest, ETH_ALEN); memcpy(match->mask.eth_addrs.dst, ext_m_spec->h_dest, ETH_ALEN); match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS); match->dissector.offset[FLOW_DISSECTOR_KEY_ETH_ADDRS] = offsetof(struct ethtool_rx_flow_key, eth_addrs); } act = &flow->rule->action.entries[0]; switch (fs->ring_cookie) { case RX_CLS_FLOW_DISC: act->id = FLOW_ACTION_DROP; break; case RX_CLS_FLOW_WAKE: act->id = FLOW_ACTION_WAKE; break; default: act->id = FLOW_ACTION_QUEUE; if (fs->flow_type & FLOW_RSS) act->queue.ctx = input->rss_ctx; act->queue.vf = ethtool_get_flow_spec_ring_vf(fs->ring_cookie); act->queue.index = ethtool_get_flow_spec_ring(fs->ring_cookie); break; } return flow; } EXPORT_SYMBOL(ethtool_rx_flow_rule_create); void ethtool_rx_flow_rule_destroy(struct ethtool_rx_flow_rule *flow) { kfree(flow->rule); kfree(flow); } EXPORT_SYMBOL(ethtool_rx_flow_rule_destroy); |
4 4 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 | // SPDX-License-Identifier: GPL-2.0 /* * Block stat tracking code * * Copyright (C) 2016 Jens Axboe */ #include <linux/kernel.h> #include <linux/rculist.h> #include "blk-stat.h" #include "blk-mq.h" #include "blk.h" struct blk_queue_stats { struct list_head callbacks; spinlock_t lock; int accounting; }; void blk_rq_stat_init(struct blk_rq_stat *stat) { stat->min = -1ULL; stat->max = stat->nr_samples = stat->mean = 0; stat->batch = 0; } /* src is a per-cpu stat, mean isn't initialized */ void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) { if (dst->nr_samples + src->nr_samples <= dst->nr_samples) return; dst->min = min(dst->min, src->min); dst->max = max(dst->max, src->max); dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples, dst->nr_samples + src->nr_samples); dst->nr_samples += src->nr_samples; } void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value) { stat->min = min(stat->min, value); stat->max = max(stat->max, value); stat->batch += value; stat->nr_samples++; } void blk_stat_add(struct request *rq, u64 now) { struct request_queue *q = rq->q; struct blk_stat_callback *cb; struct blk_rq_stat *stat; int bucket, cpu; u64 value; value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0; rcu_read_lock(); cpu = get_cpu(); list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { if (!blk_stat_is_active(cb)) continue; bucket = cb->bucket_fn(rq); if (bucket < 0) continue; stat = &per_cpu_ptr(cb->cpu_stat, cpu)[bucket]; blk_rq_stat_add(stat, value); } put_cpu(); rcu_read_unlock(); } static void blk_stat_timer_fn(struct timer_list *t) { struct blk_stat_callback *cb = from_timer(cb, t, timer); unsigned int bucket; int cpu; for (bucket = 0; bucket < cb->buckets; bucket++) blk_rq_stat_init(&cb->stat[bucket]); for_each_online_cpu(cpu) { struct blk_rq_stat *cpu_stat; cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); for (bucket = 0; bucket < cb->buckets; bucket++) { blk_rq_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]); blk_rq_stat_init(&cpu_stat[bucket]); } } cb->timer_fn(cb); } struct blk_stat_callback * blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *), int (*bucket_fn)(const struct request *), unsigned int buckets, void *data) { struct blk_stat_callback *cb; cb = kmalloc(sizeof(*cb), GFP_KERNEL); if (!cb) return NULL; cb->stat = kmalloc_array(buckets, sizeof(struct blk_rq_stat), GFP_KERNEL); if (!cb->stat) { kfree(cb); return NULL; } cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat), __alignof__(struct blk_rq_stat)); if (!cb->cpu_stat) { kfree(cb->stat); kfree(cb); return NULL; } cb->timer_fn = timer_fn; cb->bucket_fn = bucket_fn; cb->data = data; cb->buckets = buckets; timer_setup(&cb->timer, blk_stat_timer_fn, 0); return cb; } void blk_stat_add_callback(struct request_queue *q, struct blk_stat_callback *cb) { unsigned int bucket; unsigned long flags; int cpu; for_each_possible_cpu(cpu) { struct blk_rq_stat *cpu_stat; cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); for (bucket = 0; bucket < cb->buckets; bucket++) blk_rq_stat_init(&cpu_stat[bucket]); } spin_lock_irqsave(&q->stats->lock, flags); list_add_tail_rcu(&cb->list, &q->stats->callbacks); blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } void blk_stat_remove_callback(struct request_queue *q, struct blk_stat_callback *cb) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); list_del_rcu(&cb->list); if (list_empty(&q->stats->callbacks) && !q->stats->accounting) blk_queue_flag_clear(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); del_timer_sync(&cb->timer); } static void blk_stat_free_callback_rcu(struct rcu_head *head) { struct blk_stat_callback *cb; cb = container_of(head, struct blk_stat_callback, rcu); free_percpu(cb->cpu_stat); kfree(cb->stat); kfree(cb); } void blk_stat_free_callback(struct blk_stat_callback *cb) { if (cb) call_rcu(&cb->rcu, blk_stat_free_callback_rcu); } void blk_stat_disable_accounting(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); if (!--q->stats->accounting && list_empty(&q->stats->callbacks)) blk_queue_flag_clear(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } EXPORT_SYMBOL_GPL(blk_stat_disable_accounting); void blk_stat_enable_accounting(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); if (!q->stats->accounting++ && list_empty(&q->stats->callbacks)) blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } EXPORT_SYMBOL_GPL(blk_stat_enable_accounting); struct blk_queue_stats *blk_alloc_queue_stats(void) { struct blk_queue_stats *stats; stats = kmalloc(sizeof(*stats), GFP_KERNEL); if (!stats) return NULL; INIT_LIST_HEAD(&stats->callbacks); spin_lock_init(&stats->lock); stats->accounting = 0; return stats; } void blk_free_queue_stats(struct blk_queue_stats *stats) { if (!stats) return; WARN_ON(!list_empty(&stats->callbacks)); kfree(stats); } |
8 8 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 | // SPDX-License-Identifier: GPL-2.0-or-later /* 6LoWPAN fragment reassembly * * Authors: * Alexander Aring <aar@pengutronix.de> * * Based on: net/ipv6/reassembly.c */ #define pr_fmt(fmt) "6LoWPAN: " fmt #include <linux/net.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/random.h> #include <linux/jhash.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/export.h> #include <net/ieee802154_netdev.h> #include <net/6lowpan.h> #include <net/ipv6_frag.h> #include <net/inet_frag.h> #include <net/ip.h> #include "6lowpan_i.h" static const char lowpan_frags_cache_name[] = "lowpan-frags"; static struct inet_frags lowpan_frags; static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev, struct net_device *ldev); static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) { const struct frag_lowpan_compare_key *key = a; BUILD_BUG_ON(sizeof(*key) > sizeof(q->key)); memcpy(&q->key, key, sizeof(*key)); } static void lowpan_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; fq = container_of(frag, struct frag_queue, q); spin_lock(&fq->q.lock); if (fq->q.flags & INET_FRAG_COMPLETE) goto out; inet_frag_kill(&fq->q); out: spin_unlock(&fq->q.lock); inet_frag_put(&fq->q); } static inline struct lowpan_frag_queue * fq_find(struct net *net, const struct lowpan_802154_cb *cb, const struct ieee802154_addr *src, const struct ieee802154_addr *dst) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); struct frag_lowpan_compare_key key = {}; struct inet_frag_queue *q; key.tag = cb->d_tag; key.d_size = cb->d_size; key.src = *src; key.dst = *dst; q = inet_frag_find(ieee802154_lowpan->fqdir, &key); if (!q) return NULL; return container_of(q, struct lowpan_frag_queue, q); } static int lowpan_frag_queue(struct lowpan_frag_queue *fq, struct sk_buff *skb, u8 frag_type) { struct sk_buff *prev_tail; struct net_device *ldev; int end, offset, err; /* inet_frag_queue_* functions use skb->cb; see struct ipfrag_skb_cb * in inet_fragment.c */ BUILD_BUG_ON(sizeof(struct lowpan_802154_cb) > sizeof(struct inet_skb_parm)); BUILD_BUG_ON(sizeof(struct lowpan_802154_cb) > sizeof(struct inet6_skb_parm)); if (fq->q.flags & INET_FRAG_COMPLETE) goto err; offset = lowpan_802154_cb(skb)->d_offset << 3; end = lowpan_802154_cb(skb)->d_size; /* Is this the final fragment? */ if (offset + skb->len == end) { /* If we already have some bits beyond end * or have different end, the segment is corrupted. */ if (end < fq->q.len || ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) goto err; fq->q.flags |= INET_FRAG_LAST_IN; fq->q.len = end; } else { if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ if (fq->q.flags & INET_FRAG_LAST_IN) goto err; fq->q.len = end; } } ldev = skb->dev; if (ldev) skb->dev = NULL; barrier(); prev_tail = fq->q.fragments_tail; err = inet_frag_queue_insert(&fq->q, skb, offset, end); if (err) goto err; fq->q.stamp = skb->tstamp; fq->q.tstamp_type = skb->tstamp_type; if (frag_type == LOWPAN_DISPATCH_FRAG1) fq->q.flags |= INET_FRAG_FIRST_IN; fq->q.meat += skb->len; add_frag_mem_limit(fq->q.fqdir, skb->truesize); if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { int res; unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; res = lowpan_frag_reasm(fq, skb, prev_tail, ldev); skb->_skb_refdst = orefdst; return res; } skb_dst_drop(skb); return -1; err: kfree_skb(skb); return -1; } /* Check if this packet is complete. * * It is called with locked fq, and caller must check that * queue is eligible for reassembly i.e. it is not COMPLETE, * the last and the first frames arrived and all the bits are here. */ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *ldev) { void *reasm_data; inet_frag_kill(&fq->q); reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); if (!reasm_data) goto out_oom; inet_frag_reasm_finish(&fq->q, skb, reasm_data, false); skb->dev = ldev; skb->tstamp = fq->q.stamp; fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; fq->q.last_run_head = NULL; return 1; out_oom: net_dbg_ratelimited("lowpan_frag_reasm: no memory for reassembly\n"); return -1; } static int lowpan_frag_rx_handlers_result(struct sk_buff *skb, lowpan_rx_result res) { switch (res) { case RX_QUEUED: return NET_RX_SUCCESS; case RX_CONTINUE: /* nobody cared about this packet */ net_warn_ratelimited("%s: received unknown dispatch\n", __func__); fallthrough; default: /* all others failure */ return NET_RX_DROP; } } static lowpan_rx_result lowpan_frag_rx_h_iphc(struct sk_buff *skb) { int ret; if (!lowpan_is_iphc(*skb_network_header(skb))) return RX_CONTINUE; ret = lowpan_iphc_decompress(skb); if (ret < 0) return RX_DROP; return RX_QUEUED; } static int lowpan_invoke_frag_rx_handlers(struct sk_buff *skb) { lowpan_rx_result res; #define CALL_RXH(rxh) \ do { \ res = rxh(skb); \ if (res != RX_CONTINUE) \ goto rxh_next; \ } while (0) /* likely at first */ CALL_RXH(lowpan_frag_rx_h_iphc); CALL_RXH(lowpan_rx_h_ipv6); rxh_next: return lowpan_frag_rx_handlers_result(skb, res); #undef CALL_RXH } #define LOWPAN_FRAG_DGRAM_SIZE_HIGH_MASK 0x07 #define LOWPAN_FRAG_DGRAM_SIZE_HIGH_SHIFT 8 static int lowpan_get_cb(struct sk_buff *skb, u8 frag_type, struct lowpan_802154_cb *cb) { bool fail; u8 high = 0, low = 0; __be16 d_tag = 0; fail = lowpan_fetch_skb(skb, &high, 1); fail |= lowpan_fetch_skb(skb, &low, 1); /* remove the dispatch value and use first three bits as high value * for the datagram size */ cb->d_size = (high & LOWPAN_FRAG_DGRAM_SIZE_HIGH_MASK) << LOWPAN_FRAG_DGRAM_SIZE_HIGH_SHIFT | low; fail |= lowpan_fetch_skb(skb, &d_tag, 2); cb->d_tag = ntohs(d_tag); if (frag_type == LOWPAN_DISPATCH_FRAGN) { fail |= lowpan_fetch_skb(skb, &cb->d_offset, 1); } else { skb_reset_network_header(skb); cb->d_offset = 0; /* check if datagram_size has ipv6hdr on FRAG1 */ fail |= cb->d_size < sizeof(struct ipv6hdr); /* check if we can dereference the dispatch value */ fail |= !skb->len; } if (unlikely(fail)) return -EIO; return 0; } int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type) { struct lowpan_frag_queue *fq; struct net *net = dev_net(skb->dev); struct lowpan_802154_cb *cb = lowpan_802154_cb(skb); struct ieee802154_hdr hdr = {}; int err; if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) goto err; err = lowpan_get_cb(skb, frag_type, cb); if (err < 0) goto err; if (frag_type == LOWPAN_DISPATCH_FRAG1) { err = lowpan_invoke_frag_rx_handlers(skb); if (err == NET_RX_DROP) goto err; } if (cb->d_size > IPV6_MIN_MTU) { net_warn_ratelimited("lowpan_frag_rcv: datagram size exceeds MTU\n"); goto err; } fq = fq_find(net, cb, &hdr.source, &hdr.dest); if (fq != NULL) { int ret; spin_lock(&fq->q.lock); ret = lowpan_frag_queue(fq, skb, frag_type); spin_unlock(&fq->q.lock); inet_frag_put(&fq->q); return ret; } err: kfree_skb(skb); return -1; } #ifdef CONFIG_SYSCTL static struct ctl_table lowpan_frags_ns_ctl_table[] = { { .procname = "6lowpanfrag_high_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "6lowpanfrag_low_thresh", .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "6lowpanfrag_time", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, }; /* secret interval has been deprecated */ static int lowpan_frags_secret_interval_unused; static struct ctl_table lowpan_frags_ctl_table[] = { { .procname = "6lowpanfrag_secret_interval", .data = &lowpan_frags_secret_interval_unused, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, }; static int __net_init lowpan_frags_ns_sysctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); size_t table_size = ARRAY_SIZE(lowpan_frags_ns_ctl_table); table = lowpan_frags_ns_ctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(lowpan_frags_ns_ctl_table), GFP_KERNEL); if (table == NULL) goto err_alloc; /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) table_size = 0; } table[0].data = &ieee802154_lowpan->fqdir->high_thresh; table[0].extra1 = &ieee802154_lowpan->fqdir->low_thresh; table[1].data = &ieee802154_lowpan->fqdir->low_thresh; table[1].extra2 = &ieee802154_lowpan->fqdir->high_thresh; table[2].data = &ieee802154_lowpan->fqdir->timeout; hdr = register_net_sysctl_sz(net, "net/ieee802154/6lowpan", table, table_size); if (hdr == NULL) goto err_reg; ieee802154_lowpan->sysctl.frags_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void __net_exit lowpan_frags_ns_sysctl_unregister(struct net *net) { const struct ctl_table *table; struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); table = ieee802154_lowpan->sysctl.frags_hdr->ctl_table_arg; unregister_net_sysctl_table(ieee802154_lowpan->sysctl.frags_hdr); if (!net_eq(net, &init_net)) kfree(table); } static struct ctl_table_header *lowpan_ctl_header; static int __init lowpan_frags_sysctl_register(void) { lowpan_ctl_header = register_net_sysctl(&init_net, "net/ieee802154/6lowpan", lowpan_frags_ctl_table); return lowpan_ctl_header == NULL ? -ENOMEM : 0; } static void lowpan_frags_sysctl_unregister(void) { unregister_net_sysctl_table(lowpan_ctl_header); } #else static inline int lowpan_frags_ns_sysctl_register(struct net *net) { return 0; } static inline void lowpan_frags_ns_sysctl_unregister(struct net *net) { } static inline int __init lowpan_frags_sysctl_register(void) { return 0; } static inline void lowpan_frags_sysctl_unregister(void) { } #endif static int __net_init lowpan_frags_init_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); int res; res = fqdir_init(&ieee802154_lowpan->fqdir, &lowpan_frags, net); if (res < 0) return res; ieee802154_lowpan->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH; ieee802154_lowpan->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH; ieee802154_lowpan->fqdir->timeout = IPV6_FRAG_TIMEOUT; res = lowpan_frags_ns_sysctl_register(net); if (res < 0) fqdir_exit(ieee802154_lowpan->fqdir); return res; } static void __net_exit lowpan_frags_pre_exit_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); fqdir_pre_exit(ieee802154_lowpan->fqdir); } static void __net_exit lowpan_frags_exit_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); lowpan_frags_ns_sysctl_unregister(net); fqdir_exit(ieee802154_lowpan->fqdir); } static struct pernet_operations lowpan_frags_ops = { .init = lowpan_frags_init_net, .pre_exit = lowpan_frags_pre_exit_net, .exit = lowpan_frags_exit_net, }; static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed) { return jhash2(data, sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); } static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed) { const struct inet_frag_queue *fq = data; return jhash2((const u32 *)&fq->key, sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); } static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) { const struct frag_lowpan_compare_key *key = arg->key; const struct inet_frag_queue *fq = ptr; return !!memcmp(&fq->key, key, sizeof(*key)); } static const struct rhashtable_params lowpan_rhash_params = { .head_offset = offsetof(struct inet_frag_queue, node), .hashfn = lowpan_key_hashfn, .obj_hashfn = lowpan_obj_hashfn, .obj_cmpfn = lowpan_obj_cmpfn, .automatic_shrinking = true, }; int __init lowpan_net_frag_init(void) { int ret; lowpan_frags.constructor = lowpan_frag_init; lowpan_frags.destructor = NULL; lowpan_frags.qsize = sizeof(struct frag_queue); lowpan_frags.frag_expire = lowpan_frag_expire; lowpan_frags.frags_cache_name = lowpan_frags_cache_name; lowpan_frags.rhash_params = lowpan_rhash_params; ret = inet_frags_init(&lowpan_frags); if (ret) goto out; ret = lowpan_frags_sysctl_register(); if (ret) goto err_sysctl; ret = register_pernet_subsys(&lowpan_frags_ops); if (ret) goto err_pernet; out: return ret; err_pernet: lowpan_frags_sysctl_unregister(); err_sysctl: inet_frags_fini(&lowpan_frags); return ret; } void lowpan_net_frag_exit(void) { lowpan_frags_sysctl_unregister(); unregister_pernet_subsys(&lowpan_frags_ops); inet_frags_fini(&lowpan_frags); } |
17 8 16 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 | /* * net/tipc/discover.c * * Copyright (c) 2003-2006, 2014-2018, Ericsson AB * Copyright (c) 2005-2006, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "node.h" #include "discover.h" /* min delay during bearer start up */ #define TIPC_DISC_INIT msecs_to_jiffies(125) /* max delay if bearer has no links */ #define TIPC_DISC_FAST msecs_to_jiffies(1000) /* max delay if bearer has links */ #define TIPC_DISC_SLOW msecs_to_jiffies(60000) /* indicates no timer in use */ #define TIPC_DISC_INACTIVE 0xffffffff /** * struct tipc_discoverer - information about an ongoing link setup request * @bearer_id: identity of bearer issuing requests * @net: network namespace instance * @dest: destination address for request messages * @domain: network domain to which links can be established * @num_nodes: number of nodes currently discovered (i.e. with an active link) * @lock: spinlock for controlling access to requests * @skb: request message to be (repeatedly) sent * @timer: timer governing period between requests * @timer_intv: current interval between requests (in ms) */ struct tipc_discoverer { u32 bearer_id; struct tipc_media_addr dest; struct net *net; u32 domain; int num_nodes; spinlock_t lock; struct sk_buff *skb; struct timer_list timer; unsigned long timer_intv; }; /** * tipc_disc_init_msg - initialize a link setup message * @net: the applicable net namespace * @skb: buffer containing message * @mtyp: message type (request or response) * @b: ptr to bearer issuing message */ static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb, u32 mtyp, struct tipc_bearer *b) { struct tipc_net *tn = tipc_net(net); u32 dest_domain = b->domain; struct tipc_msg *hdr; hdr = buf_msg(skb); tipc_msg_init(tn->trial_addr, hdr, LINK_CONFIG, mtyp, MAX_H_SIZE, dest_domain); msg_set_size(hdr, MAX_H_SIZE + NODE_ID_LEN); msg_set_non_seq(hdr, 1); msg_set_node_sig(hdr, tn->random); msg_set_node_capabilities(hdr, TIPC_NODE_CAPABILITIES); msg_set_dest_domain(hdr, dest_domain); msg_set_bc_netid(hdr, tn->net_id); b->media->addr2msg(msg_media_addr(hdr), &b->addr); msg_set_peer_net_hash(hdr, tipc_net_hash_mixes(net, tn->random)); msg_set_node_id(hdr, tipc_own_id(net)); } static void tipc_disc_msg_xmit(struct net *net, u32 mtyp, u32 dst, u32 src, u32 sugg_addr, struct tipc_media_addr *maddr, struct tipc_bearer *b) { struct tipc_msg *hdr; struct sk_buff *skb; skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC); if (!skb) return; hdr = buf_msg(skb); tipc_disc_init_msg(net, skb, mtyp, b); msg_set_sugg_node_addr(hdr, sugg_addr); msg_set_dest_domain(hdr, dst); tipc_bearer_xmit_skb(net, b->identity, skb, maddr); } /** * disc_dupl_alert - issue node address duplication alert * @b: pointer to bearer detecting duplication * @node_addr: duplicated node address * @media_addr: media address advertised by duplicated node */ static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr, struct tipc_media_addr *media_addr) { char media_addr_str[64]; tipc_media_addr_printf(media_addr_str, sizeof(media_addr_str), media_addr); pr_warn("Duplicate %x using %s seen on <%s>\n", node_addr, media_addr_str, b->name); } /* tipc_disc_addr_trial(): - handle an address uniqueness trial from peer * Returns true if message should be dropped by caller, i.e., if it is a * trial message or we are inside trial period. Otherwise false. */ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, struct tipc_media_addr *maddr, struct tipc_bearer *b, u32 dst, u32 src, u32 sugg_addr, u8 *peer_id, int mtyp) { struct net *net = d->net; struct tipc_net *tn = tipc_net(net); u32 self = tipc_own_addr(net); bool trial = time_before(jiffies, tn->addr_trial_end) && !self; if (mtyp == DSC_TRIAL_FAIL_MSG) { if (!trial) return true; /* Ignore if somebody else already gave new suggestion */ if (dst != tn->trial_addr) return true; /* Otherwise update trial address and restart trial period */ tn->trial_addr = sugg_addr; msg_set_prevnode(buf_msg(d->skb), sugg_addr); tn->addr_trial_end = jiffies + msecs_to_jiffies(1000); return true; } /* Apply trial address if we just left trial period */ if (!trial && !self) { schedule_work(&tn->work); msg_set_prevnode(buf_msg(d->skb), tn->trial_addr); msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); } /* Accept regular link requests/responses only after trial period */ if (mtyp != DSC_TRIAL_MSG) return trial; sugg_addr = tipc_node_try_addr(net, peer_id, src); if (sugg_addr) tipc_disc_msg_xmit(net, DSC_TRIAL_FAIL_MSG, src, self, sugg_addr, maddr, b); return true; } /** * tipc_disc_rcv - handle incoming discovery message (request or response) * @net: applicable net namespace * @skb: buffer containing message * @b: bearer that message arrived on */ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { struct tipc_net *tn = tipc_net(net); struct tipc_msg *hdr = buf_msg(skb); u32 pnet_hash = msg_peer_net_hash(hdr); u16 caps = msg_node_capabilities(hdr); bool legacy = tn->legacy_addr_format; u32 sugg = msg_sugg_node_addr(hdr); u32 signature = msg_node_sig(hdr); u8 peer_id[NODE_ID_LEN] = {0,}; u32 dst = msg_dest_domain(hdr); u32 net_id = msg_bc_netid(hdr); struct tipc_media_addr maddr; u32 src = msg_prevnode(hdr); u32 mtyp = msg_type(hdr); bool dupl_addr = false; bool respond = false; u32 self; int err; if (skb_linearize(skb)) { kfree_skb(skb); return; } hdr = buf_msg(skb); if (caps & TIPC_NODE_ID128) memcpy(peer_id, msg_node_id(hdr), NODE_ID_LEN); else sprintf(peer_id, "%x", src); err = b->media->msg2addr(b, &maddr, msg_media_addr(hdr)); kfree_skb(skb); if (err || maddr.broadcast) { pr_warn_ratelimited("Rcv corrupt discovery message\n"); return; } /* Ignore discovery messages from own node */ if (!memcmp(&maddr, &b->addr, sizeof(maddr))) return; if (net_id != tn->net_id) return; if (tipc_disc_addr_trial_msg(b->disc, &maddr, b, dst, src, sugg, peer_id, mtyp)) return; self = tipc_own_addr(net); /* Message from somebody using this node's address */ if (in_own_node(net, src)) { disc_dupl_alert(b, self, &maddr); return; } if (!tipc_in_scope(legacy, dst, self)) return; if (!tipc_in_scope(legacy, b->domain, src)) return; tipc_node_check_dest(net, src, peer_id, b, caps, signature, pnet_hash, &maddr, &respond, &dupl_addr); if (dupl_addr) disc_dupl_alert(b, src, &maddr); if (!respond) return; if (mtyp != DSC_REQ_MSG) return; tipc_disc_msg_xmit(net, DSC_RESP_MSG, src, self, 0, &maddr, b); } /* tipc_disc_add_dest - increment set of discovered nodes */ void tipc_disc_add_dest(struct tipc_discoverer *d) { spin_lock_bh(&d->lock); d->num_nodes++; spin_unlock_bh(&d->lock); } /* tipc_disc_remove_dest - decrement set of discovered nodes */ void tipc_disc_remove_dest(struct tipc_discoverer *d) { int intv, num; spin_lock_bh(&d->lock); d->num_nodes--; num = d->num_nodes; intv = d->timer_intv; if (!num && (intv == TIPC_DISC_INACTIVE || intv > TIPC_DISC_FAST)) { d->timer_intv = TIPC_DISC_INIT; mod_timer(&d->timer, jiffies + d->timer_intv); } spin_unlock_bh(&d->lock); } /* tipc_disc_timeout - send a periodic link setup request * Called whenever a link setup request timer associated with a bearer expires. * - Keep doubling time between sent request until limit is reached; * - Hold at fast polling rate if we don't have any associated nodes * - Otherwise hold at slow polling rate */ static void tipc_disc_timeout(struct timer_list *t) { struct tipc_discoverer *d = from_timer(d, t, timer); struct tipc_net *tn = tipc_net(d->net); struct tipc_media_addr maddr; struct sk_buff *skb = NULL; struct net *net = d->net; u32 bearer_id; spin_lock_bh(&d->lock); /* Stop searching if only desired node has been found */ if (tipc_node(d->domain) && d->num_nodes) { d->timer_intv = TIPC_DISC_INACTIVE; goto exit; } /* Did we just leave trial period ? */ if (!time_before(jiffies, tn->addr_trial_end) && !tipc_own_addr(net)) { mod_timer(&d->timer, jiffies + TIPC_DISC_INIT); spin_unlock_bh(&d->lock); schedule_work(&tn->work); return; } /* Adjust timeout interval according to discovery phase */ if (time_before(jiffies, tn->addr_trial_end)) { d->timer_intv = TIPC_DISC_INIT; } else { d->timer_intv *= 2; if (d->num_nodes && d->timer_intv > TIPC_DISC_SLOW) d->timer_intv = TIPC_DISC_SLOW; else if (!d->num_nodes && d->timer_intv > TIPC_DISC_FAST) d->timer_intv = TIPC_DISC_FAST; msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); msg_set_prevnode(buf_msg(d->skb), tn->trial_addr); } mod_timer(&d->timer, jiffies + d->timer_intv); memcpy(&maddr, &d->dest, sizeof(maddr)); skb = skb_clone(d->skb, GFP_ATOMIC); bearer_id = d->bearer_id; exit: spin_unlock_bh(&d->lock); if (skb) tipc_bearer_xmit_skb(net, bearer_id, skb, &maddr); } /** * tipc_disc_create - create object to send periodic link setup requests * @net: the applicable net namespace * @b: ptr to bearer issuing requests * @dest: destination address for request messages * @skb: pointer to created frame * * Return: 0 if successful, otherwise -errno. */ int tipc_disc_create(struct net *net, struct tipc_bearer *b, struct tipc_media_addr *dest, struct sk_buff **skb) { struct tipc_net *tn = tipc_net(net); struct tipc_discoverer *d; d = kmalloc(sizeof(*d), GFP_ATOMIC); if (!d) return -ENOMEM; d->skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC); if (!d->skb) { kfree(d); return -ENOMEM; } tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b); /* Do we need an address trial period first ? */ if (!tipc_own_addr(net)) { tn->addr_trial_end = jiffies + msecs_to_jiffies(1000); msg_set_type(buf_msg(d->skb), DSC_TRIAL_MSG); } memcpy(&d->dest, dest, sizeof(*dest)); d->net = net; d->bearer_id = b->identity; d->domain = b->domain; d->num_nodes = 0; d->timer_intv = TIPC_DISC_INIT; spin_lock_init(&d->lock); timer_setup(&d->timer, tipc_disc_timeout, 0); mod_timer(&d->timer, jiffies + d->timer_intv); b->disc = d; *skb = skb_clone(d->skb, GFP_ATOMIC); return 0; } /** * tipc_disc_delete - destroy object sending periodic link setup requests * @d: ptr to link dest structure */ void tipc_disc_delete(struct tipc_discoverer *d) { timer_shutdown_sync(&d->timer); kfree_skb(d->skb); kfree(d); } /** * tipc_disc_reset - reset object to send periodic link setup requests * @net: the applicable net namespace * @b: ptr to bearer issuing requests */ void tipc_disc_reset(struct net *net, struct tipc_bearer *b) { struct tipc_discoverer *d = b->disc; struct tipc_media_addr maddr; struct sk_buff *skb; spin_lock_bh(&d->lock); tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b); d->net = net; d->bearer_id = b->identity; d->domain = b->domain; d->num_nodes = 0; d->timer_intv = TIPC_DISC_INIT; memcpy(&maddr, &d->dest, sizeof(maddr)); mod_timer(&d->timer, jiffies + d->timer_intv); skb = skb_clone(d->skb, GFP_ATOMIC); spin_unlock_bh(&d->lock); if (skb) tipc_bearer_xmit_skb(net, b->identity, skb, &maddr); } |
169 167 163 15 3 165 3 12 13 13 189 117 111 109 117 117 182 197 70 158 197 4 1 2 3 3 149 150 41 133 35 35 5 5 167 165 165 10 167 2 2 1 1 1 2 10 5 5 10 7 10 10 150 151 150 50 167 42 150 42 7 36 40 127 118 15 1 126 151 45 168 10 2 9 180 180 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2003 * Copyright (c) Cisco 1999,2000 * Copyright (c) Motorola 1999,2000,2001 * Copyright (c) La Monte H.P. Yarroll 2001 * * This file is part of the SCTP kernel implementation. * * A collection class to handle the storage of transport addresses. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> */ #include <linux/types.h> #include <linux/slab.h> #include <linux/in.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/if_inet6.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> /* Forward declarations for internal helpers. */ static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest, union sctp_addr *addr, enum sctp_scope scope, gfp_t gfp, int flags); static void sctp_bind_addr_clean(struct sctp_bind_addr *); /* First Level Abstractions. */ /* Copy 'src' to 'dest' taking 'scope' into account. Omit addresses * in 'src' which have a broader scope than 'scope'. */ int sctp_bind_addr_copy(struct net *net, struct sctp_bind_addr *dest, const struct sctp_bind_addr *src, enum sctp_scope scope, gfp_t gfp, int flags) { struct sctp_sockaddr_entry *addr; int error = 0; /* All addresses share the same port. */ dest->port = src->port; /* Extract the addresses which are relevant for this scope. */ list_for_each_entry(addr, &src->address_list, list) { error = sctp_copy_one_addr(net, dest, &addr->a, scope, gfp, flags); if (error < 0) goto out; } /* If there are no addresses matching the scope and * this is global scope, try to get a link scope address, with * the assumption that we must be sitting behind a NAT. */ if (list_empty(&dest->address_list) && (SCTP_SCOPE_GLOBAL == scope)) { list_for_each_entry(addr, &src->address_list, list) { error = sctp_copy_one_addr(net, dest, &addr->a, SCTP_SCOPE_LINK, gfp, flags); if (error < 0) goto out; } } /* If somehow no addresses were found that can be used with this * scope, it's an error. */ if (list_empty(&dest->address_list)) error = -ENETUNREACH; out: if (error) sctp_bind_addr_clean(dest); return error; } /* Exactly duplicate the address lists. This is necessary when doing * peer-offs and accepts. We don't want to put all the current system * addresses into the endpoint. That's useless. But we do want duplicat * the list of bound addresses that the older endpoint used. */ int sctp_bind_addr_dup(struct sctp_bind_addr *dest, const struct sctp_bind_addr *src, gfp_t gfp) { struct sctp_sockaddr_entry *addr; int error = 0; /* All addresses share the same port. */ dest->port = src->port; list_for_each_entry(addr, &src->address_list, list) { error = sctp_add_bind_addr(dest, &addr->a, sizeof(addr->a), 1, gfp); if (error < 0) break; } return error; } /* Initialize the SCTP_bind_addr structure for either an endpoint or * an association. */ void sctp_bind_addr_init(struct sctp_bind_addr *bp, __u16 port) { INIT_LIST_HEAD(&bp->address_list); bp->port = port; } /* Dispose of the address list. */ static void sctp_bind_addr_clean(struct sctp_bind_addr *bp) { struct sctp_sockaddr_entry *addr, *temp; /* Empty the bind address list. */ list_for_each_entry_safe(addr, temp, &bp->address_list, list) { list_del_rcu(&addr->list); kfree_rcu(addr, rcu); SCTP_DBG_OBJCNT_DEC(addr); } } /* Dispose of an SCTP_bind_addr structure */ void sctp_bind_addr_free(struct sctp_bind_addr *bp) { /* Empty the bind address list. */ sctp_bind_addr_clean(bp); } /* Add an address to the bind address list in the SCTP_bind_addr structure. */ int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new, int new_size, __u8 addr_state, gfp_t gfp) { struct sctp_sockaddr_entry *addr; /* Add the address to the bind address list. */ addr = kzalloc(sizeof(*addr), gfp); if (!addr) return -ENOMEM; memcpy(&addr->a, new, min_t(size_t, sizeof(*new), new_size)); /* Fix up the port if it has not yet been set. * Both v4 and v6 have the port at the same offset. */ if (!addr->a.v4.sin_port) addr->a.v4.sin_port = htons(bp->port); addr->state = addr_state; addr->valid = 1; INIT_LIST_HEAD(&addr->list); /* We always hold a socket lock when calling this function, * and that acts as a writer synchronizing lock. */ list_add_tail_rcu(&addr->list, &bp->address_list); SCTP_DBG_OBJCNT_INC(addr); return 0; } /* Delete an address from the bind address list in the SCTP_bind_addr * structure. */ int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr) { struct sctp_sockaddr_entry *addr, *temp; int found = 0; /* We hold the socket lock when calling this function, * and that acts as a writer synchronizing lock. */ list_for_each_entry_safe(addr, temp, &bp->address_list, list) { if (sctp_cmp_addr_exact(&addr->a, del_addr)) { /* Found the exact match. */ found = 1; addr->valid = 0; list_del_rcu(&addr->list); break; } } if (found) { kfree_rcu(addr, rcu); SCTP_DBG_OBJCNT_DEC(addr); return 0; } return -EINVAL; } /* Create a network byte-order representation of all the addresses * formated as SCTP parameters. * * The second argument is the return value for the length. */ union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp, int *addrs_len, gfp_t gfp) { union sctp_params addrparms; union sctp_params retval; int addrparms_len; union sctp_addr_param rawaddr; int len; struct sctp_sockaddr_entry *addr; struct list_head *pos; struct sctp_af *af; addrparms_len = 0; len = 0; /* Allocate enough memory at once. */ list_for_each(pos, &bp->address_list) { len += sizeof(union sctp_addr_param); } /* Don't even bother embedding an address if there * is only one. */ if (len == sizeof(union sctp_addr_param)) { retval.v = NULL; goto end_raw; } retval.v = kmalloc(len, gfp); if (!retval.v) goto end_raw; addrparms = retval; list_for_each_entry(addr, &bp->address_list, list) { af = sctp_get_af_specific(addr->a.v4.sin_family); len = af->to_addr_param(&addr->a, &rawaddr); memcpy(addrparms.v, &rawaddr, len); addrparms.v += len; addrparms_len += len; } end_raw: *addrs_len = addrparms_len; return retval; } /* * Create an address list out of the raw address list format (IPv4 and IPv6 * address parameters). */ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, int addrs_len, __u16 port, gfp_t gfp) { union sctp_addr_param *rawaddr; struct sctp_paramhdr *param; union sctp_addr addr; int retval = 0; int len; struct sctp_af *af; /* Convert the raw address to standard address format */ while (addrs_len) { param = (struct sctp_paramhdr *)raw_addr_list; rawaddr = (union sctp_addr_param *)raw_addr_list; af = sctp_get_af_specific(param_type2af(param->type)); if (unlikely(!af) || !af->from_addr_param(&addr, rawaddr, htons(port), 0)) { retval = -EINVAL; goto out_err; } if (sctp_bind_addr_state(bp, &addr) != -1) goto next; retval = sctp_add_bind_addr(bp, &addr, sizeof(addr), SCTP_ADDR_SRC, gfp); if (retval) /* Can't finish building the list, clean up. */ goto out_err; next: len = ntohs(param->length); addrs_len -= len; raw_addr_list += len; } return retval; out_err: if (retval) sctp_bind_addr_clean(bp); return retval; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* Does this contain a specified address? Allow wildcarding. */ int sctp_bind_addr_match(struct sctp_bind_addr *bp, const union sctp_addr *addr, struct sctp_sock *opt) { struct sctp_sockaddr_entry *laddr; int match = 0; rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; if (opt->pf->cmp_addr(&laddr->a, addr, opt)) { match = 1; break; } } rcu_read_unlock(); return match; } int sctp_bind_addrs_check(struct sctp_sock *sp, struct sctp_sock *sp2, int cnt2) { struct sctp_bind_addr *bp2 = &sp2->ep->base.bind_addr; struct sctp_bind_addr *bp = &sp->ep->base.bind_addr; struct sctp_sockaddr_entry *laddr, *laddr2; bool exist = false; int cnt = 0; rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { list_for_each_entry_rcu(laddr2, &bp2->address_list, list) { if (sp->pf->af->cmp_addr(&laddr->a, &laddr2->a) && laddr->valid && laddr2->valid) { exist = true; goto next; } } cnt = 0; break; next: cnt++; } rcu_read_unlock(); return (cnt == cnt2) ? 0 : (exist ? -EEXIST : 1); } /* Does the address 'addr' conflict with any addresses in * the bp. */ int sctp_bind_addr_conflict(struct sctp_bind_addr *bp, const union sctp_addr *addr, struct sctp_sock *bp_sp, struct sctp_sock *addr_sp) { struct sctp_sockaddr_entry *laddr; int conflict = 0; struct sctp_sock *sp; /* Pick the IPv6 socket as the basis of comparison * since it's usually a superset of the IPv4. * If there is no IPv6 socket, then default to bind_addr. */ if (sctp_opt2sk(bp_sp)->sk_family == AF_INET6) sp = bp_sp; else if (sctp_opt2sk(addr_sp)->sk_family == AF_INET6) sp = addr_sp; else sp = bp_sp; rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; conflict = sp->pf->cmp_addr(&laddr->a, addr, sp); if (conflict) break; } rcu_read_unlock(); return conflict; } /* Get the state of the entry in the bind_addr_list */ int sctp_bind_addr_state(const struct sctp_bind_addr *bp, const union sctp_addr *addr) { struct sctp_sockaddr_entry *laddr; struct sctp_af *af; af = sctp_get_af_specific(addr->sa.sa_family); if (unlikely(!af)) return -1; list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; if (af->cmp_addr(&laddr->a, addr)) return laddr->state; } return -1; } /* Find the first address in the bind address list that is not present in * the addrs packed array. */ union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp, const union sctp_addr *addrs, int addrcnt, struct sctp_sock *opt) { struct sctp_sockaddr_entry *laddr; union sctp_addr *addr; void *addr_buf; struct sctp_af *af; int i; /* This is only called sctp_send_asconf_del_ip() and we hold * the socket lock in that code patch, so that address list * can't change. */ list_for_each_entry(laddr, &bp->address_list, list) { addr_buf = (union sctp_addr *)addrs; for (i = 0; i < addrcnt; i++) { addr = addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); if (!af) break; if (opt->pf->cmp_addr(&laddr->a, addr, opt)) break; addr_buf += af->sockaddr_len; } if (i == addrcnt) return &laddr->a; } return NULL; } /* Copy out addresses from the global local address list. */ static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest, union sctp_addr *addr, enum sctp_scope scope, gfp_t gfp, int flags) { int error = 0; if (sctp_is_any(NULL, addr)) { error = sctp_copy_local_addr_list(net, dest, scope, gfp, flags); } else if (sctp_in_scope(net, addr, scope)) { /* Now that the address is in scope, check to see if * the address type is supported by local sock as * well as the remote peer. */ if ((((AF_INET == addr->sa.sa_family) && (flags & SCTP_ADDR4_ALLOWED) && (flags & SCTP_ADDR4_PEERSUPP))) || (((AF_INET6 == addr->sa.sa_family) && (flags & SCTP_ADDR6_ALLOWED) && (flags & SCTP_ADDR6_PEERSUPP)))) error = sctp_add_bind_addr(dest, addr, sizeof(*addr), SCTP_ADDR_SRC, gfp); } return error; } /* Is this a wildcard address? */ int sctp_is_any(struct sock *sk, const union sctp_addr *addr) { unsigned short fam = 0; struct sctp_af *af; /* Try to get the right address family */ if (addr->sa.sa_family != AF_UNSPEC) fam = addr->sa.sa_family; else if (sk) fam = sk->sk_family; af = sctp_get_af_specific(fam); if (!af) return 0; return af->is_any(addr); } /* Is 'addr' valid for 'scope'? */ int sctp_in_scope(struct net *net, const union sctp_addr *addr, enum sctp_scope scope) { enum sctp_scope addr_scope = sctp_scope(addr); /* The unusable SCTP addresses will not be considered with * any defined scopes. */ if (SCTP_SCOPE_UNUSABLE == addr_scope) return 0; /* * For INIT and INIT-ACK address list, let L be the level of * requested destination address, sender and receiver * SHOULD include all of its addresses with level greater * than or equal to L. * * Address scoping can be selectively controlled via sysctl * option */ switch (net->sctp.scope_policy) { case SCTP_SCOPE_POLICY_DISABLE: return 1; case SCTP_SCOPE_POLICY_ENABLE: if (addr_scope <= scope) return 1; break; case SCTP_SCOPE_POLICY_PRIVATE: if (addr_scope <= scope || SCTP_SCOPE_PRIVATE == addr_scope) return 1; break; case SCTP_SCOPE_POLICY_LINK: if (addr_scope <= scope || SCTP_SCOPE_LINK == addr_scope) return 1; break; default: break; } return 0; } int sctp_is_ep_boundall(struct sock *sk) { struct sctp_bind_addr *bp; struct sctp_sockaddr_entry *addr; bp = &sctp_sk(sk)->ep->base.bind_addr; if (sctp_list_single_entry(&bp->address_list)) { addr = list_entry(bp->address_list.next, struct sctp_sockaddr_entry, list); if (sctp_is_any(sk, &addr->a)) return 1; } return 0; } /******************************************************************** * 3rd Level Abstractions ********************************************************************/ /* What is the scope of 'addr'? */ enum sctp_scope sctp_scope(const union sctp_addr *addr) { struct sctp_af *af; af = sctp_get_af_specific(addr->sa.sa_family); if (!af) return SCTP_SCOPE_UNUSABLE; return af->scope((union sctp_addr *)addr); } |
4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 | // SPDX-License-Identifier: GPL-2.0-only /* * sd.c Copyright (C) 1992 Drew Eckhardt * Copyright (C) 1993, 1994, 1995, 1999 Eric Youngdale * * Linux scsi disk driver * Initial versions: Drew Eckhardt * Subsequent revisions: Eric Youngdale * Modification history: * - Drew Eckhardt <drew@colorado.edu> original * - Eric Youngdale <eric@andante.org> add scatter-gather, multiple * outstanding request, and other enhancements. * Support loadable low-level scsi drivers. * - Jirka Hanika <geo@ff.cuni.cz> support more scsi disks using * eight major numbers. * - Richard Gooch <rgooch@atnf.csiro.au> support devfs. * - Torben Mathiasen <tmm@image.dk> Resource allocation fixes in * sd_init and cleanups. * - Alex Davis <letmein@erols.com> Fix problem where partition info * not being read in sd_open. Fix problem where removable media * could be ejected after sd_open. * - Douglas Gilbert <dgilbert@interlog.com> cleanup for lk 2.5.x * - Badari Pulavarty <pbadari@us.ibm.com>, Matthew Wilcox * <willy@debian.org>, Kurt Garloff <garloff@suse.de>: * Support 32k/1M disks. * * Logging policy (needs CONFIG_SCSI_LOGGING defined): * - setting up transfer: SCSI_LOG_HLQUEUE levels 1 and 2 * - end of transfer (bh + scsi_lib): SCSI_LOG_HLCOMPLETE level 1 * - entering sd_ioctl: SCSI_LOG_IOCTL level 1 * - entering other commands: SCSI_LOG_HLQUEUE level 3 * Note: when the logging level is set by the user, it must be greater * than the level indicated above to trigger output. */ #include <linux/bio-integrity.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/hdreg.h> #include <linux/errno.h> #include <linux/idr.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/blkpg.h> #include <linux/blk-pm.h> #include <linux/delay.h> #include <linux/rw_hint.h> #include <linux/major.h> #include <linux/mutex.h> #include <linux/string_helpers.h> #include <linux/slab.h> #include <linux/sed-opal.h> #include <linux/pm_runtime.h> #include <linux/pr.h> #include <linux/t10-pi.h> #include <linux/uaccess.h> #include <linux/unaligned.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_device.h> #include <scsi/scsi_devinfo.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_host.h> #include <scsi/scsi_ioctl.h> #include <scsi/scsicam.h> #include <scsi/scsi_common.h> #include "sd.h" #include "scsi_priv.h" #include "scsi_logging.h" MODULE_AUTHOR("Eric Youngdale"); MODULE_DESCRIPTION("SCSI disk (sd) driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK0_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK1_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK2_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK3_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK4_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK5_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK6_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK7_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK8_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK9_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK10_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK11_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK12_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK13_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK14_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK15_MAJOR); MODULE_ALIAS_SCSI_DEVICE(TYPE_DISK); MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD); MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC); MODULE_ALIAS_SCSI_DEVICE(TYPE_ZBC); #define SD_MINORS 16 static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim, unsigned int mode); static void sd_config_write_same(struct scsi_disk *sdkp, struct queue_limits *lim); static int sd_revalidate_disk(struct gendisk *); static void sd_unlock_native_capacity(struct gendisk *disk); static void sd_shutdown(struct device *); static void scsi_disk_release(struct device *cdev); static DEFINE_IDA(sd_index_ida); static mempool_t *sd_page_pool; static struct lock_class_key sd_bio_compl_lkclass; static const char *sd_cache_types[] = { "write through", "none", "write back", "write back, no read (daft)" }; static void sd_set_flush_flag(struct scsi_disk *sdkp, struct queue_limits *lim) { if (sdkp->WCE) { lim->features |= BLK_FEAT_WRITE_CACHE; if (sdkp->DPOFUA) lim->features |= BLK_FEAT_FUA; else lim->features &= ~BLK_FEAT_FUA; } else { lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA); } } static ssize_t cache_type_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ct, rcd, wce, sp; struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; char buffer[64]; char *buffer_data; struct scsi_mode_data data; struct scsi_sense_hdr sshdr; static const char temp[] = "temporary "; int len, ret; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) /* no cache control on RBC devices; theoretically they * can do it, but there's probably so many exceptions * it's not worth the risk */ return -EINVAL; if (strncmp(buf, temp, sizeof(temp) - 1) == 0) { buf += sizeof(temp) - 1; sdkp->cache_override = 1; } else { sdkp->cache_override = 0; } ct = sysfs_match_string(sd_cache_types, buf); if (ct < 0) return -EINVAL; rcd = ct & 0x01 ? 1 : 0; wce = (ct & 0x02) && !sdkp->write_prot ? 1 : 0; if (sdkp->cache_override) { struct queue_limits lim; sdkp->WCE = wce; sdkp->RCD = rcd; lim = queue_limits_start_update(sdkp->disk->queue); sd_set_flush_flag(sdkp, &lim); blk_mq_freeze_queue(sdkp->disk->queue); ret = queue_limits_commit_update(sdkp->disk->queue, &lim); blk_mq_unfreeze_queue(sdkp->disk->queue); if (ret) return ret; return count; } if (scsi_mode_sense(sdp, 0x08, 8, 0, buffer, sizeof(buffer), SD_TIMEOUT, sdkp->max_retries, &data, NULL)) return -EINVAL; len = min_t(size_t, sizeof(buffer), data.length - data.header_length - data.block_descriptor_length); buffer_data = buffer + data.header_length + data.block_descriptor_length; buffer_data[2] &= ~0x05; buffer_data[2] |= wce << 2 | rcd; sp = buffer_data[0] & 0x80 ? 1 : 0; buffer_data[0] &= ~0x80; /* * Ensure WP, DPOFUA, and RESERVED fields are cleared in * received mode parameter buffer before doing MODE SELECT. */ data.device_specific = 0; ret = scsi_mode_select(sdp, 1, sp, buffer_data, len, SD_TIMEOUT, sdkp->max_retries, &data, &sshdr); if (ret) { if (ret > 0 && scsi_sense_valid(&sshdr)) sd_print_sense_hdr(sdkp, &sshdr); return -EINVAL; } sd_revalidate_disk(sdkp->disk); return count; } static ssize_t manage_start_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_system_start_stop && sdp->manage_runtime_start_stop && sdp->manage_shutdown); } static DEVICE_ATTR_RO(manage_start_stop); static ssize_t manage_system_start_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_system_start_stop); } static ssize_t manage_system_start_stop_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; bool v; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (kstrtobool(buf, &v)) return -EINVAL; sdp->manage_system_start_stop = v; return count; } static DEVICE_ATTR_RW(manage_system_start_stop); static ssize_t manage_runtime_start_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_runtime_start_stop); } static ssize_t manage_runtime_start_stop_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; bool v; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (kstrtobool(buf, &v)) return -EINVAL; sdp->manage_runtime_start_stop = v; return count; } static DEVICE_ATTR_RW(manage_runtime_start_stop); static ssize_t manage_shutdown_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sysfs_emit(buf, "%u\n", sdp->manage_shutdown); } static ssize_t manage_shutdown_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; bool v; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (kstrtobool(buf, &v)) return -EINVAL; sdp->manage_shutdown = v; return count; } static DEVICE_ATTR_RW(manage_shutdown); static ssize_t allow_restart_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->device->allow_restart); } static ssize_t allow_restart_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { bool v; struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) return -EINVAL; if (kstrtobool(buf, &v)) return -EINVAL; sdp->allow_restart = v; return count; } static DEVICE_ATTR_RW(allow_restart); static ssize_t cache_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); int ct = sdkp->RCD + 2*sdkp->WCE; return sprintf(buf, "%s\n", sd_cache_types[ct]); } static DEVICE_ATTR_RW(cache_type); static ssize_t FUA_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->DPOFUA); } static DEVICE_ATTR_RO(FUA); static ssize_t protection_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->protection_type); } static ssize_t protection_type_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); unsigned int val; int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; err = kstrtouint(buf, 10, &val); if (err) return err; if (val <= T10_PI_TYPE3_PROTECTION) sdkp->protection_type = val; return count; } static DEVICE_ATTR_RW(protection_type); static ssize_t protection_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; unsigned int dif, dix; dif = scsi_host_dif_capable(sdp->host, sdkp->protection_type); dix = scsi_host_dix_capable(sdp->host, sdkp->protection_type); if (!dix && scsi_host_dix_capable(sdp->host, T10_PI_TYPE0_PROTECTION)) { dif = 0; dix = 1; } if (!dif && !dix) return sprintf(buf, "none\n"); return sprintf(buf, "%s%u\n", dix ? "dix" : "dif", dif); } static DEVICE_ATTR_RO(protection_mode); static ssize_t app_tag_own_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->ATO); } static DEVICE_ATTR_RO(app_tag_own); static ssize_t thin_provisioning_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->lbpme); } static DEVICE_ATTR_RO(thin_provisioning); /* sysfs_match_string() requires dense arrays */ static const char *lbp_mode[] = { [SD_LBP_FULL] = "full", [SD_LBP_UNMAP] = "unmap", [SD_LBP_WS16] = "writesame_16", [SD_LBP_WS10] = "writesame_10", [SD_LBP_ZERO] = "writesame_zero", [SD_LBP_DISABLE] = "disabled", }; static ssize_t provisioning_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%s\n", lbp_mode[sdkp->provisioning_mode]); } static ssize_t provisioning_mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; struct queue_limits lim; int mode, err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (sdp->type != TYPE_DISK) return -EINVAL; mode = sysfs_match_string(lbp_mode, buf); if (mode < 0) return -EINVAL; lim = queue_limits_start_update(sdkp->disk->queue); sd_config_discard(sdkp, &lim, mode); blk_mq_freeze_queue(sdkp->disk->queue); err = queue_limits_commit_update(sdkp->disk->queue, &lim); blk_mq_unfreeze_queue(sdkp->disk->queue); if (err) return err; return count; } static DEVICE_ATTR_RW(provisioning_mode); /* sysfs_match_string() requires dense arrays */ static const char *zeroing_mode[] = { [SD_ZERO_WRITE] = "write", [SD_ZERO_WS] = "writesame", [SD_ZERO_WS16_UNMAP] = "writesame_16_unmap", [SD_ZERO_WS10_UNMAP] = "writesame_10_unmap", }; static ssize_t zeroing_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%s\n", zeroing_mode[sdkp->zeroing_mode]); } static ssize_t zeroing_mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); int mode; if (!capable(CAP_SYS_ADMIN)) return -EACCES; mode = sysfs_match_string(zeroing_mode, buf); if (mode < 0) return -EINVAL; sdkp->zeroing_mode = mode; return count; } static DEVICE_ATTR_RW(zeroing_mode); static ssize_t max_medium_access_timeouts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->max_medium_access_timeouts); } static ssize_t max_medium_access_timeouts_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; err = kstrtouint(buf, 10, &sdkp->max_medium_access_timeouts); return err ? err : count; } static DEVICE_ATTR_RW(max_medium_access_timeouts); static ssize_t max_write_same_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->max_ws_blocks); } static ssize_t max_write_same_blocks_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; struct queue_limits lim; unsigned long max; int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) return -EINVAL; err = kstrtoul(buf, 10, &max); if (err) return err; if (max == 0) sdp->no_write_same = 1; else if (max <= SD_MAX_WS16_BLOCKS) { sdp->no_write_same = 0; sdkp->max_ws_blocks = max; } lim = queue_limits_start_update(sdkp->disk->queue); sd_config_write_same(sdkp, &lim); blk_mq_freeze_queue(sdkp->disk->queue); err = queue_limits_commit_update(sdkp->disk->queue, &lim); blk_mq_unfreeze_queue(sdkp->disk->queue); if (err) return err; return count; } static DEVICE_ATTR_RW(max_write_same_blocks); static ssize_t zoned_cap_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); if (sdkp->device->type == TYPE_ZBC) return sprintf(buf, "host-managed\n"); if (sdkp->zoned == 1) return sprintf(buf, "host-aware\n"); if (sdkp->zoned == 2) return sprintf(buf, "drive-managed\n"); return sprintf(buf, "none\n"); } static DEVICE_ATTR_RO(zoned_cap); static ssize_t max_retries_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdev = sdkp->device; int retries, err; err = kstrtoint(buf, 10, &retries); if (err) return err; if (retries == SCSI_CMD_RETRIES_NO_LIMIT || retries <= SD_MAX_RETRIES) { sdkp->max_retries = retries; return count; } sdev_printk(KERN_ERR, sdev, "max_retries must be between -1 and %d\n", SD_MAX_RETRIES); return -EINVAL; } static ssize_t max_retries_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%d\n", sdkp->max_retries); } static DEVICE_ATTR_RW(max_retries); static struct attribute *sd_disk_attrs[] = { &dev_attr_cache_type.attr, &dev_attr_FUA.attr, &dev_attr_allow_restart.attr, &dev_attr_manage_start_stop.attr, &dev_attr_manage_system_start_stop.attr, &dev_attr_manage_runtime_start_stop.attr, &dev_attr_manage_shutdown.attr, &dev_attr_protection_type.attr, &dev_attr_protection_mode.attr, &dev_attr_app_tag_own.attr, &dev_attr_thin_provisioning.attr, &dev_attr_provisioning_mode.attr, &dev_attr_zeroing_mode.attr, &dev_attr_max_write_same_blocks.attr, &dev_attr_max_medium_access_timeouts.attr, &dev_attr_zoned_cap.attr, &dev_attr_max_retries.attr, NULL, }; ATTRIBUTE_GROUPS(sd_disk); static struct class sd_disk_class = { .name = "scsi_disk", .dev_release = scsi_disk_release, .dev_groups = sd_disk_groups, }; /* * Don't request a new module, as that could deadlock in multipath * environment. */ static void sd_default_probe(dev_t devt) { } /* * Device no to disk mapping: * * major disc2 disc p1 * |............|.............|....|....| <- dev_t * 31 20 19 8 7 4 3 0 * * Inside a major, we have 16k disks, however mapped non- * contiguously. The first 16 disks are for major0, the next * ones with major1, ... Disk 256 is for major0 again, disk 272 * for major1, ... * As we stay compatible with our numbering scheme, we can reuse * the well-know SCSI majors 8, 65--71, 136--143. */ static int sd_major(int major_idx) { switch (major_idx) { case 0: return SCSI_DISK0_MAJOR; case 1 ... 7: return SCSI_DISK1_MAJOR + major_idx - 1; case 8 ... 15: return SCSI_DISK8_MAJOR + major_idx - 8; default: BUG(); return 0; /* shut up gcc */ } } #ifdef CONFIG_BLK_SED_OPAL static int sd_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, bool send) { struct scsi_disk *sdkp = data; struct scsi_device *sdev = sdkp->device; u8 cdb[12] = { 0, }; const struct scsi_exec_args exec_args = { .req_flags = BLK_MQ_REQ_PM, }; int ret; cdb[0] = send ? SECURITY_PROTOCOL_OUT : SECURITY_PROTOCOL_IN; cdb[1] = secp; put_unaligned_be16(spsp, &cdb[2]); put_unaligned_be32(len, &cdb[6]); ret = scsi_execute_cmd(sdev, cdb, send ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, buffer, len, SD_TIMEOUT, sdkp->max_retries, &exec_args); return ret <= 0 ? ret : -EIO; } #endif /* CONFIG_BLK_SED_OPAL */ /* * Look up the DIX operation based on whether the command is read or * write and whether dix and dif are enabled. */ static unsigned int sd_prot_op(bool write, bool dix, bool dif) { /* Lookup table: bit 2 (write), bit 1 (dix), bit 0 (dif) */ static const unsigned int ops[] = { /* wrt dix dif */ SCSI_PROT_NORMAL, /* 0 0 0 */ SCSI_PROT_READ_STRIP, /* 0 0 1 */ SCSI_PROT_READ_INSERT, /* 0 1 0 */ SCSI_PROT_READ_PASS, /* 0 1 1 */ SCSI_PROT_NORMAL, /* 1 0 0 */ SCSI_PROT_WRITE_INSERT, /* 1 0 1 */ SCSI_PROT_WRITE_STRIP, /* 1 1 0 */ SCSI_PROT_WRITE_PASS, /* 1 1 1 */ }; return ops[write << 2 | dix << 1 | dif]; } /* * Returns a mask of the protection flags that are valid for a given DIX * operation. */ static unsigned int sd_prot_flag_mask(unsigned int prot_op) { static const unsigned int flag_mask[] = { [SCSI_PROT_NORMAL] = 0, [SCSI_PROT_READ_STRIP] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT, [SCSI_PROT_READ_INSERT] = SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, [SCSI_PROT_READ_PASS] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, [SCSI_PROT_WRITE_INSERT] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_REF_INCREMENT, [SCSI_PROT_WRITE_STRIP] = SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, [SCSI_PROT_WRITE_PASS] = SCSI_PROT_TRANSFER_PI | SCSI_PROT_GUARD_CHECK | SCSI_PROT_REF_CHECK | SCSI_PROT_REF_INCREMENT | SCSI_PROT_IP_CHECKSUM, }; return flag_mask[prot_op]; } static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, unsigned int dix, unsigned int dif) { struct request *rq = scsi_cmd_to_rq(scmd); struct bio *bio = rq->bio; unsigned int prot_op = sd_prot_op(rq_data_dir(rq), dix, dif); unsigned int protect = 0; if (dix) { /* DIX Type 0, 1, 2, 3 */ if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM)) scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM; if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) scmd->prot_flags |= SCSI_PROT_GUARD_CHECK; } if (dif != T10_PI_TYPE3_PROTECTION) { /* DIX/DIF Type 0, 1, 2 */ scmd->prot_flags |= SCSI_PROT_REF_INCREMENT; if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) scmd->prot_flags |= SCSI_PROT_REF_CHECK; } if (dif) { /* DIX/DIF Type 1, 2, 3 */ scmd->prot_flags |= SCSI_PROT_TRANSFER_PI; if (bio_integrity_flagged(bio, BIP_DISK_NOCHECK)) protect = 3 << 5; /* Disable target PI checking */ else protect = 1 << 5; /* Enable target PI checking */ } scsi_set_prot_op(scmd, prot_op); scsi_set_prot_type(scmd, dif); scmd->prot_flags &= sd_prot_flag_mask(prot_op); return protect; } static void sd_disable_discard(struct scsi_disk *sdkp) { sdkp->provisioning_mode = SD_LBP_DISABLE; blk_queue_disable_discard(sdkp->disk->queue); } static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim, unsigned int mode) { unsigned int logical_block_size = sdkp->device->sector_size; unsigned int max_blocks = 0; lim->discard_alignment = sdkp->unmap_alignment * logical_block_size; lim->discard_granularity = max(sdkp->physical_block_size, sdkp->unmap_granularity * logical_block_size); sdkp->provisioning_mode = mode; switch (mode) { case SD_LBP_FULL: case SD_LBP_DISABLE: break; case SD_LBP_UNMAP: max_blocks = min_not_zero(sdkp->max_unmap_blocks, (u32)SD_MAX_WS16_BLOCKS); break; case SD_LBP_WS16: if (sdkp->device->unmap_limit_for_ws) max_blocks = sdkp->max_unmap_blocks; else max_blocks = sdkp->max_ws_blocks; max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS16_BLOCKS); break; case SD_LBP_WS10: if (sdkp->device->unmap_limit_for_ws) max_blocks = sdkp->max_unmap_blocks; else max_blocks = sdkp->max_ws_blocks; max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS10_BLOCKS); break; case SD_LBP_ZERO: max_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS10_BLOCKS); break; } lim->max_hw_discard_sectors = max_blocks * (logical_block_size >> SECTOR_SHIFT); } static void *sd_set_special_bvec(struct request *rq, unsigned int data_len) { struct page *page; page = mempool_alloc(sd_page_pool, GFP_ATOMIC); if (!page) return NULL; clear_highpage(page); bvec_set_page(&rq->special_vec, page, data_len, 0); rq->rq_flags |= RQF_SPECIAL_PAYLOAD; return bvec_virt(&rq->special_vec); } static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); unsigned int data_len = 24; char *buf; buf = sd_set_special_bvec(rq, data_len); if (!buf) return BLK_STS_RESOURCE; cmd->cmd_len = 10; cmd->cmnd[0] = UNMAP; cmd->cmnd[8] = 24; put_unaligned_be16(6 + 16, &buf[0]); put_unaligned_be16(16, &buf[2]); put_unaligned_be64(lba, &buf[8]); put_unaligned_be32(nr_blocks, &buf[16]); cmd->allowed = sdkp->max_retries; cmd->transfersize = data_len; rq->timeout = SD_TIMEOUT; return scsi_alloc_sgtables(cmd); } static void sd_config_atomic(struct scsi_disk *sdkp, struct queue_limits *lim) { unsigned int logical_block_size = sdkp->device->sector_size, physical_block_size_sectors, max_atomic, unit_min, unit_max; if ((!sdkp->max_atomic && !sdkp->max_atomic_with_boundary) || sdkp->protection_type == T10_PI_TYPE2_PROTECTION) return; physical_block_size_sectors = sdkp->physical_block_size / sdkp->device->sector_size; unit_min = rounddown_pow_of_two(sdkp->atomic_granularity ? sdkp->atomic_granularity : physical_block_size_sectors); /* * Only use atomic boundary when we have the odd scenario of * sdkp->max_atomic == 0, which the spec does permit. */ if (sdkp->max_atomic) { max_atomic = sdkp->max_atomic; unit_max = rounddown_pow_of_two(sdkp->max_atomic); sdkp->use_atomic_write_boundary = 0; } else { max_atomic = sdkp->max_atomic_with_boundary; unit_max = rounddown_pow_of_two(sdkp->max_atomic_boundary); sdkp->use_atomic_write_boundary = 1; } /* * Ensure compliance with granularity and alignment. For now, keep it * simple and just don't support atomic writes for values mismatched * with max_{boundary}atomic, physical block size, and * atomic_granularity itself. * * We're really being distrustful by checking unit_max also... */ if (sdkp->atomic_granularity > 1) { if (unit_min > 1 && unit_min % sdkp->atomic_granularity) return; if (unit_max > 1 && unit_max % sdkp->atomic_granularity) return; } if (sdkp->atomic_alignment > 1) { if (unit_min > 1 && unit_min % sdkp->atomic_alignment) return; if (unit_max > 1 && unit_max % sdkp->atomic_alignment) return; } lim->atomic_write_hw_max = max_atomic * logical_block_size; lim->atomic_write_hw_boundary = 0; lim->atomic_write_hw_unit_min = unit_min * logical_block_size; lim->atomic_write_hw_unit_max = unit_max * logical_block_size; } static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; if (!sd_set_special_bvec(rq, data_len)) return BLK_STS_RESOURCE; cmd->cmd_len = 16; cmd->cmnd[0] = WRITE_SAME_16; if (unmap) cmd->cmnd[1] = 0x8; /* UNMAP */ put_unaligned_be64(lba, &cmd->cmnd[2]); put_unaligned_be32(nr_blocks, &cmd->cmnd[10]); cmd->allowed = sdkp->max_retries; cmd->transfersize = data_len; rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT; return scsi_alloc_sgtables(cmd); } static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; if (!sd_set_special_bvec(rq, data_len)) return BLK_STS_RESOURCE; cmd->cmd_len = 10; cmd->cmnd[0] = WRITE_SAME; if (unmap) cmd->cmnd[1] = 0x8; /* UNMAP */ put_unaligned_be32(lba, &cmd->cmnd[2]); put_unaligned_be16(nr_blocks, &cmd->cmnd[7]); cmd->allowed = sdkp->max_retries; cmd->transfersize = data_len; rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT; return scsi_alloc_sgtables(cmd); } static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); if (!(rq->cmd_flags & REQ_NOUNMAP)) { switch (sdkp->zeroing_mode) { case SD_ZERO_WS16_UNMAP: return sd_setup_write_same16_cmnd(cmd, true); case SD_ZERO_WS10_UNMAP: return sd_setup_write_same10_cmnd(cmd, true); } } if (sdp->no_write_same) { rq->rq_flags |= RQF_QUIET; return BLK_STS_TARGET; } if (sdkp->ws16 || lba > 0xffffffff || nr_blocks > 0xffff) return sd_setup_write_same16_cmnd(cmd, false); return sd_setup_write_same10_cmnd(cmd, false); } static void sd_disable_write_same(struct scsi_disk *sdkp) { sdkp->device->no_write_same = 1; sdkp->max_ws_blocks = 0; blk_queue_disable_write_zeroes(sdkp->disk->queue); } static void sd_config_write_same(struct scsi_disk *sdkp, struct queue_limits *lim) { unsigned int logical_block_size = sdkp->device->sector_size; if (sdkp->device->no_write_same) { sdkp->max_ws_blocks = 0; goto out; } /* Some devices can not handle block counts above 0xffff despite * supporting WRITE SAME(16). Consequently we default to 64k * blocks per I/O unless the device explicitly advertises a * bigger limit. */ if (sdkp->max_ws_blocks > SD_MAX_WS10_BLOCKS) sdkp->max_ws_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS16_BLOCKS); else if (sdkp->ws16 || sdkp->ws10 || sdkp->device->no_report_opcodes) sdkp->max_ws_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS10_BLOCKS); else { sdkp->device->no_write_same = 1; sdkp->max_ws_blocks = 0; } if (sdkp->lbprz && sdkp->lbpws) sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP; else if (sdkp->lbprz && sdkp->lbpws10) sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP; else if (sdkp->max_ws_blocks) sdkp->zeroing_mode = SD_ZERO_WS; else sdkp->zeroing_mode = SD_ZERO_WRITE; if (sdkp->max_ws_blocks && sdkp->physical_block_size > logical_block_size) { /* * Reporting a maximum number of blocks that is not aligned * on the device physical size would cause a large write same * request to be split into physically unaligned chunks by * __blkdev_issue_write_zeroes() even if the caller of this * functions took care to align the large request. So make sure * the maximum reported is aligned to the device physical block * size. This is only an optional optimization for regular * disks, but this is mandatory to avoid failure of large write * same requests directed at sequential write required zones of * host-managed ZBC disks. */ sdkp->max_ws_blocks = round_down(sdkp->max_ws_blocks, bytes_to_logical(sdkp->device, sdkp->physical_block_size)); } out: lim->max_write_zeroes_sectors = sdkp->max_ws_blocks * (logical_block_size >> SECTOR_SHIFT); } static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); /* flush requests don't perform I/O, zero the S/G table */ memset(&cmd->sdb, 0, sizeof(cmd->sdb)); if (cmd->device->use_16_for_sync) { cmd->cmnd[0] = SYNCHRONIZE_CACHE_16; cmd->cmd_len = 16; } else { cmd->cmnd[0] = SYNCHRONIZE_CACHE; cmd->cmd_len = 10; } cmd->transfersize = 0; cmd->allowed = sdkp->max_retries; rq->timeout = rq->q->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; return BLK_STS_OK; } /** * sd_group_number() - Compute the GROUP NUMBER field * @cmd: SCSI command for which to compute the value of the six-bit GROUP NUMBER * field. * * From SBC-5 r05 (https://www.t10.org/cgi-bin/ac.pl?t=f&f=sbc5r05.pdf): * 0: no relative lifetime. * 1: shortest relative lifetime. * 2: second shortest relative lifetime. * 3 - 0x3d: intermediate relative lifetimes. * 0x3e: second longest relative lifetime. * 0x3f: longest relative lifetime. */ static u8 sd_group_number(struct scsi_cmnd *cmd) { const struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); if (!sdkp->rscs) return 0; return min3((u32)rq->write_hint, (u32)sdkp->permanent_stream_count, 0x3fu); } static blk_status_t sd_setup_rw32_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags, unsigned int dld) { cmd->cmd_len = SD_EXT_CDB_SIZE; cmd->cmnd[0] = VARIABLE_LENGTH_CMD; cmd->cmnd[6] = sd_group_number(cmd); cmd->cmnd[7] = 0x18; /* Additional CDB len */ cmd->cmnd[9] = write ? WRITE_32 : READ_32; cmd->cmnd[10] = flags; cmd->cmnd[11] = dld & 0x07; put_unaligned_be64(lba, &cmd->cmnd[12]); put_unaligned_be32(lba, &cmd->cmnd[20]); /* Expected Indirect LBA */ put_unaligned_be32(nr_blocks, &cmd->cmnd[28]); return BLK_STS_OK; } static blk_status_t sd_setup_rw16_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags, unsigned int dld) { cmd->cmd_len = 16; cmd->cmnd[0] = write ? WRITE_16 : READ_16; cmd->cmnd[1] = flags | ((dld >> 2) & 0x01); cmd->cmnd[14] = ((dld & 0x03) << 6) | sd_group_number(cmd); cmd->cmnd[15] = 0; put_unaligned_be64(lba, &cmd->cmnd[2]); put_unaligned_be32(nr_blocks, &cmd->cmnd[10]); return BLK_STS_OK; } static blk_status_t sd_setup_rw10_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags) { cmd->cmd_len = 10; cmd->cmnd[0] = write ? WRITE_10 : READ_10; cmd->cmnd[1] = flags; cmd->cmnd[6] = sd_group_number(cmd); cmd->cmnd[9] = 0; put_unaligned_be32(lba, &cmd->cmnd[2]); put_unaligned_be16(nr_blocks, &cmd->cmnd[7]); return BLK_STS_OK; } static blk_status_t sd_setup_rw6_cmnd(struct scsi_cmnd *cmd, bool write, sector_t lba, unsigned int nr_blocks, unsigned char flags) { /* Avoid that 0 blocks gets translated into 256 blocks. */ if (WARN_ON_ONCE(nr_blocks == 0)) return BLK_STS_IOERR; if (unlikely(flags & 0x8)) { /* * This happens only if this drive failed 10byte rw * command with ILLEGAL_REQUEST during operation and * thus turned off use_10_for_rw. */ scmd_printk(KERN_ERR, cmd, "FUA write on READ/WRITE(6) drive\n"); return BLK_STS_IOERR; } cmd->cmd_len = 6; cmd->cmnd[0] = write ? WRITE_6 : READ_6; cmd->cmnd[1] = (lba >> 16) & 0x1f; cmd->cmnd[2] = (lba >> 8) & 0xff; cmd->cmnd[3] = lba & 0xff; cmd->cmnd[4] = nr_blocks; cmd->cmnd[5] = 0; return BLK_STS_OK; } /* * Check if a command has a duration limit set. If it does, and the target * device supports CDL and the feature is enabled, return the limit * descriptor index to use. Return 0 (no limit) otherwise. */ static int sd_cdl_dld(struct scsi_disk *sdkp, struct scsi_cmnd *scmd) { struct scsi_device *sdp = sdkp->device; int hint; if (!sdp->cdl_supported || !sdp->cdl_enable) return 0; /* * Use "no limit" if the request ioprio does not specify a duration * limit hint. */ hint = IOPRIO_PRIO_HINT(req_get_ioprio(scsi_cmd_to_rq(scmd))); if (hint < IOPRIO_HINT_DEV_DURATION_LIMIT_1 || hint > IOPRIO_HINT_DEV_DURATION_LIMIT_7) return 0; return (hint - IOPRIO_HINT_DEV_DURATION_LIMIT_1) + 1; } static blk_status_t sd_setup_atomic_cmnd(struct scsi_cmnd *cmd, sector_t lba, unsigned int nr_blocks, bool boundary, unsigned char flags) { cmd->cmd_len = 16; cmd->cmnd[0] = WRITE_ATOMIC_16; cmd->cmnd[1] = flags; put_unaligned_be64(lba, &cmd->cmnd[2]); put_unaligned_be16(nr_blocks, &cmd->cmnd[12]); if (boundary) put_unaligned_be16(nr_blocks, &cmd->cmnd[10]); else put_unaligned_be16(0, &cmd->cmnd[10]); put_unaligned_be16(nr_blocks, &cmd->cmnd[12]); cmd->cmnd[14] = 0; cmd->cmnd[15] = 0; return BLK_STS_OK; } static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; struct scsi_disk *sdkp = scsi_disk(rq->q->disk); sector_t lba = sectors_to_logical(sdp, blk_rq_pos(rq)); sector_t threshold; unsigned int nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); unsigned int mask = logical_to_sectors(sdp, 1) - 1; bool write = rq_data_dir(rq) == WRITE; unsigned char protect, fua; unsigned int dld; blk_status_t ret; unsigned int dif; bool dix; ret = scsi_alloc_sgtables(cmd); if (ret != BLK_STS_OK) return ret; ret = BLK_STS_IOERR; if (!scsi_device_online(sdp) || sdp->changed) { scmd_printk(KERN_ERR, cmd, "device offline or changed\n"); goto fail; } if (blk_rq_pos(rq) + blk_rq_sectors(rq) > get_capacity(rq->q->disk)) { scmd_printk(KERN_ERR, cmd, "access beyond end of device\n"); goto fail; } if ((blk_rq_pos(rq) & mask) || (blk_rq_sectors(rq) & mask)) { scmd_printk(KERN_ERR, cmd, "request not aligned to the logical block size\n"); goto fail; } /* * Some SD card readers can't handle accesses which touch the * last one or two logical blocks. Split accesses as needed. */ threshold = sdkp->capacity - SD_LAST_BUGGY_SECTORS; if (unlikely(sdp->last_sector_bug && lba + nr_blocks > threshold)) { if (lba < threshold) { /* Access up to the threshold but not beyond */ nr_blocks = threshold - lba; } else { /* Access only a single logical block */ nr_blocks = 1; } } fua = rq->cmd_flags & REQ_FUA ? 0x8 : 0; dix = scsi_prot_sg_count(cmd); dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type); dld = sd_cdl_dld(sdkp, cmd); if (dif || dix) protect = sd_setup_protect_cmnd(cmd, dix, dif); else protect = 0; if (protect && sdkp->protection_type == T10_PI_TYPE2_PROTECTION) { ret = sd_setup_rw32_cmnd(cmd, write, lba, nr_blocks, protect | fua, dld); } else if (rq->cmd_flags & REQ_ATOMIC) { ret = sd_setup_atomic_cmnd(cmd, lba, nr_blocks, sdkp->use_atomic_write_boundary, protect | fua); } else if (sdp->use_16_for_rw || (nr_blocks > 0xffff)) { ret = sd_setup_rw16_cmnd(cmd, write, lba, nr_blocks, protect | fua, dld); } else if ((nr_blocks > 0xff) || (lba > 0x1fffff) || sdp->use_10_for_rw || protect || rq->write_hint) { ret = sd_setup_rw10_cmnd(cmd, write, lba, nr_blocks, protect | fua); } else { ret = sd_setup_rw6_cmnd(cmd, write, lba, nr_blocks, protect | fua); } if (unlikely(ret != BLK_STS_OK)) goto fail; /* * We shouldn't disconnect in the middle of a sector, so with a dumb * host adapter, it's safe to assume that we can at least transfer * this many bytes between each connect / disconnect. */ cmd->transfersize = sdp->sector_size; cmd->underflow = nr_blocks << 9; cmd->allowed = sdkp->max_retries; cmd->sdb.length = nr_blocks * sdp->sector_size; SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, cmd, "%s: block=%llu, count=%d\n", __func__, (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq))); SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, cmd, "%s %d/%u 512 byte blocks.\n", write ? "writing" : "reading", nr_blocks, blk_rq_sectors(rq))); /* * This indicates that the command is ready from our end to be queued. */ return BLK_STS_OK; fail: scsi_free_sgtables(cmd); return ret; } static blk_status_t sd_init_command(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); switch (req_op(rq)) { case REQ_OP_DISCARD: switch (scsi_disk(rq->q->disk)->provisioning_mode) { case SD_LBP_UNMAP: return sd_setup_unmap_cmnd(cmd); case SD_LBP_WS16: return sd_setup_write_same16_cmnd(cmd, true); case SD_LBP_WS10: return sd_setup_write_same10_cmnd(cmd, true); case SD_LBP_ZERO: return sd_setup_write_same10_cmnd(cmd, false); default: return BLK_STS_TARGET; } case REQ_OP_WRITE_ZEROES: return sd_setup_write_zeroes_cmnd(cmd); case REQ_OP_FLUSH: return sd_setup_flush_cmnd(cmd); case REQ_OP_READ: case REQ_OP_WRITE: return sd_setup_read_write_cmnd(cmd); case REQ_OP_ZONE_RESET: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER, false); case REQ_OP_ZONE_RESET_ALL: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER, true); case REQ_OP_ZONE_OPEN: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_OPEN_ZONE, false); case REQ_OP_ZONE_CLOSE: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_CLOSE_ZONE, false); case REQ_OP_ZONE_FINISH: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_FINISH_ZONE, false); default: WARN_ON_ONCE(1); return BLK_STS_NOTSUPP; } } static void sd_uninit_command(struct scsi_cmnd *SCpnt) { struct request *rq = scsi_cmd_to_rq(SCpnt); if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) mempool_free(rq->special_vec.bv_page, sd_page_pool); } static bool sd_need_revalidate(struct gendisk *disk, struct scsi_disk *sdkp) { if (sdkp->device->removable || sdkp->write_prot) { if (disk_check_media_change(disk)) return true; } /* * Force a full rescan after ioctl(BLKRRPART). While the disk state has * nothing to do with partitions, BLKRRPART is used to force a full * revalidate after things like a format for historical reasons. */ return test_bit(GD_NEED_PART_SCAN, &disk->state); } /** * sd_open - open a scsi disk device * @disk: disk to open * @mode: open mode * * Returns 0 if successful. Returns a negated errno value in case * of error. * * Note: This can be called from a user context (e.g. fsck(1) ) * or from within the kernel (e.g. as a result of a mount(1) ). * In the latter case @inode and @filp carry an abridged amount * of information as noted above. * * Locking: called with disk->open_mutex held. **/ static int sd_open(struct gendisk *disk, blk_mode_t mode) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdev = sdkp->device; int retval; if (scsi_device_get(sdev)) return -ENXIO; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_open\n")); /* * If the device is in error recovery, wait until it is done. * If the device is offline, then disallow any access to it. */ retval = -ENXIO; if (!scsi_block_when_processing_errors(sdev)) goto error_out; if (sd_need_revalidate(disk, sdkp)) sd_revalidate_disk(disk); /* * If the drive is empty, just let the open fail. */ retval = -ENOMEDIUM; if (sdev->removable && !sdkp->media_present && !(mode & BLK_OPEN_NDELAY)) goto error_out; /* * If the device has the write protect tab set, have the open fail * if the user expects to be able to write to the thing. */ retval = -EROFS; if (sdkp->write_prot && (mode & BLK_OPEN_WRITE)) goto error_out; /* * It is possible that the disk changing stuff resulted in * the device being taken offline. If this is the case, * report this to the user, and don't pretend that the * open actually succeeded. */ retval = -ENXIO; if (!scsi_device_online(sdev)) goto error_out; if ((atomic_inc_return(&sdkp->openers) == 1) && sdev->removable) { if (scsi_block_when_processing_errors(sdev)) scsi_set_medium_removal(sdev, SCSI_REMOVAL_PREVENT); } return 0; error_out: scsi_device_put(sdev); return retval; } /** * sd_release - invoked when the (last) close(2) is called on this * scsi disk. * @disk: disk to release * * Returns 0. * * Note: may block (uninterruptible) if error recovery is underway * on this disk. * * Locking: called with disk->open_mutex held. **/ static void sd_release(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdev = sdkp->device; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_release\n")); if (atomic_dec_return(&sdkp->openers) == 0 && sdev->removable) { if (scsi_block_when_processing_errors(sdev)) scsi_set_medium_removal(sdev, SCSI_REMOVAL_ALLOW); } scsi_device_put(sdev); } static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdp = sdkp->device; struct Scsi_Host *host = sdp->host; sector_t capacity = logical_to_sectors(sdp, sdkp->capacity); int diskinfo[4]; /* default to most commonly used values */ diskinfo[0] = 0x40; /* 1 << 6 */ diskinfo[1] = 0x20; /* 1 << 5 */ diskinfo[2] = capacity >> 11; /* override with calculated, extended default, or driver values */ if (host->hostt->bios_param) host->hostt->bios_param(sdp, bdev, capacity, diskinfo); else scsicam_bios_param(bdev, capacity, diskinfo); geo->heads = diskinfo[0]; geo->sectors = diskinfo[1]; geo->cylinders = diskinfo[2]; return 0; } /** * sd_ioctl - process an ioctl * @bdev: target block device * @mode: open mode * @cmd: ioctl command number * @arg: this is third argument given to ioctl(2) system call. * Often contains a pointer. * * Returns 0 if successful (some ioctls return positive numbers on * success as well). Returns a negated errno value in case of error. * * Note: most ioctls are forward onto the block subsystem or further * down in the scsi subsystem. **/ static int sd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct gendisk *disk = bdev->bd_disk; struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; void __user *p = (void __user *)arg; int error; SCSI_LOG_IOCTL(1, sd_printk(KERN_INFO, sdkp, "sd_ioctl: disk=%s, " "cmd=0x%x\n", disk->disk_name, cmd)); if (bdev_is_partition(bdev) && !capable(CAP_SYS_RAWIO)) return -ENOIOCTLCMD; /* * If we are in the middle of error recovery, don't let anyone * else try and use this device. Also, if error recovery fails, it * may try and take the device offline, in which case all further * access to the device is prohibited. */ error = scsi_ioctl_block_when_processing_errors(sdp, cmd, (mode & BLK_OPEN_NDELAY)); if (error) return error; if (is_sed_ioctl(cmd)) return sed_ioctl(sdkp->opal_dev, cmd, p); return scsi_ioctl(sdp, mode & BLK_OPEN_WRITE, cmd, p); } static void set_media_not_present(struct scsi_disk *sdkp) { if (sdkp->media_present) sdkp->device->changed = 1; if (sdkp->device->removable) { sdkp->media_present = 0; sdkp->capacity = 0; } } static int media_not_present(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) { if (!scsi_sense_valid(sshdr)) return 0; /* not invoked for commands that could return deferred errors */ switch (sshdr->sense_key) { case UNIT_ATTENTION: case NOT_READY: /* medium not present */ if (sshdr->asc == 0x3A) { set_media_not_present(sdkp); return 1; } } return 0; } /** * sd_check_events - check media events * @disk: kernel device descriptor * @clearing: disk events currently being cleared * * Returns mask of DISK_EVENT_*. * * Note: this function is invoked from the block subsystem. **/ static unsigned int sd_check_events(struct gendisk *disk, unsigned int clearing) { struct scsi_disk *sdkp = disk->private_data; struct scsi_device *sdp; int retval; bool disk_changed; if (!sdkp) return 0; sdp = sdkp->device; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_check_events\n")); /* * If the device is offline, don't send any commands - just pretend as * if the command failed. If the device ever comes back online, we * can deal with it then. It is only because of unrecoverable errors * that we would ever take a device offline in the first place. */ if (!scsi_device_online(sdp)) { set_media_not_present(sdkp); goto out; } /* * Using TEST_UNIT_READY enables differentiation between drive with * no cartridge loaded - NOT READY, drive with changed cartridge - * UNIT ATTENTION, or with same cartridge - GOOD STATUS. * * Drives that auto spin down. eg iomega jaz 1G, will be started * by sd_spinup_disk() from sd_revalidate_disk(), which happens whenever * sd_revalidate() is called. */ if (scsi_block_when_processing_errors(sdp)) { struct scsi_sense_hdr sshdr = { 0, }; retval = scsi_test_unit_ready(sdp, SD_TIMEOUT, sdkp->max_retries, &sshdr); /* failed to execute TUR, assume media not present */ if (retval < 0 || host_byte(retval)) { set_media_not_present(sdkp); goto out; } if (media_not_present(sdkp, &sshdr)) goto out; } /* * For removable scsi disk we have to recognise the presence * of a disk in the drive. */ if (!sdkp->media_present) sdp->changed = 1; sdkp->media_present = 1; out: /* * sdp->changed is set under the following conditions: * * Medium present state has changed in either direction. * Device has indicated UNIT_ATTENTION. */ disk_changed = sdp->changed; sdp->changed = 0; return disk_changed ? DISK_EVENT_MEDIA_CHANGE : 0; } static int sd_sync_cache(struct scsi_disk *sdkp) { int res; struct scsi_device *sdp = sdkp->device; const int timeout = sdp->request_queue->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; /* Leave the rest of the command zero to indicate flush everything. */ const unsigned char cmd[16] = { sdp->use_16_for_sync ? SYNCHRONIZE_CACHE_16 : SYNCHRONIZE_CACHE }; struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { { .allowed = 3, .result = SCMD_FAILURE_RESULT_ANY, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .req_flags = BLK_MQ_REQ_PM, .sshdr = &sshdr, .failures = &failures, }; if (!scsi_device_online(sdp)) return -ENODEV; res = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, NULL, 0, timeout, sdkp->max_retries, &exec_args); if (res) { sd_print_result(sdkp, "Synchronize Cache(10) failed", res); if (res < 0) return res; if (scsi_status_is_check_condition(res) && scsi_sense_valid(&sshdr)) { sd_print_sense_hdr(sdkp, &sshdr); /* we need to evaluate the error return */ if (sshdr.asc == 0x3a || /* medium not present */ sshdr.asc == 0x20 || /* invalid command */ (sshdr.asc == 0x74 && sshdr.ascq == 0x71)) /* drive is password locked */ /* this is no error here */ return 0; /* * If a format is in progress or if the drive does not * support sync, there is not much we can do because * this is called during shutdown or suspend so just * return success so those operations can proceed. */ if ((sshdr.asc == 0x04 && sshdr.ascq == 0x04) || sshdr.sense_key == ILLEGAL_REQUEST) return 0; } switch (host_byte(res)) { /* ignore errors due to racing a disconnection */ case DID_BAD_TARGET: case DID_NO_CONNECT: return 0; /* signal the upper layer it might try again */ case DID_BUS_BUSY: case DID_IMM_RETRY: case DID_REQUEUE: case DID_SOFT_ERROR: return -EBUSY; default: return -EIO; } } return 0; } static void sd_rescan(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); sd_revalidate_disk(sdkp->disk); } static int sd_get_unique_id(struct gendisk *disk, u8 id[16], enum blk_unique_id type) { struct scsi_device *sdev = scsi_disk(disk)->device; const struct scsi_vpd *vpd; const unsigned char *d; int ret = -ENXIO, len; rcu_read_lock(); vpd = rcu_dereference(sdev->vpd_pg83); if (!vpd) goto out_unlock; ret = -EINVAL; for (d = vpd->data + 4; d < vpd->data + vpd->len; d += d[3] + 4) { /* we only care about designators with LU association */ if (((d[1] >> 4) & 0x3) != 0x00) continue; if ((d[1] & 0xf) != type) continue; /* * Only exit early if a 16-byte descriptor was found. Otherwise * keep looking as one with more entropy might still show up. */ len = d[3]; if (len != 8 && len != 12 && len != 16) continue; ret = len; memcpy(id, d + 4, len); if (len == 16) break; } out_unlock: rcu_read_unlock(); return ret; } static int sd_scsi_to_pr_err(struct scsi_sense_hdr *sshdr, int result) { switch (host_byte(result)) { case DID_TRANSPORT_MARGINAL: case DID_TRANSPORT_DISRUPTED: case DID_BUS_BUSY: return PR_STS_RETRY_PATH_FAILURE; case DID_NO_CONNECT: return PR_STS_PATH_FAILED; case DID_TRANSPORT_FAILFAST: return PR_STS_PATH_FAST_FAILED; } switch (status_byte(result)) { case SAM_STAT_RESERVATION_CONFLICT: return PR_STS_RESERVATION_CONFLICT; case SAM_STAT_CHECK_CONDITION: if (!scsi_sense_valid(sshdr)) return PR_STS_IOERR; if (sshdr->sense_key == ILLEGAL_REQUEST && (sshdr->asc == 0x26 || sshdr->asc == 0x24)) return -EINVAL; fallthrough; default: return PR_STS_IOERR; } } static int sd_pr_in_command(struct block_device *bdev, u8 sa, unsigned char *data, int data_len) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdev = sdkp->device; struct scsi_sense_hdr sshdr; u8 cmd[10] = { PERSISTENT_RESERVE_IN, sa }; struct scsi_failure failure_defs[] = { { .sense = UNIT_ATTENTION, .asc = SCMD_FAILURE_ASC_ANY, .ascq = SCMD_FAILURE_ASCQ_ANY, .allowed = 5, .result = SAM_STAT_CHECK_CONDITION, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .failures = &failures, }; int result; put_unaligned_be16(data_len, &cmd[7]); result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, data, data_len, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (scsi_status_is_check_condition(result) && scsi_sense_valid(&sshdr)) { sdev_printk(KERN_INFO, sdev, "PR command failed: %d\n", result); scsi_print_sense_hdr(sdev, NULL, &sshdr); } if (result <= 0) return result; return sd_scsi_to_pr_err(&sshdr, result); } static int sd_pr_read_keys(struct block_device *bdev, struct pr_keys *keys_info) { int result, i, data_offset, num_copy_keys; u32 num_keys = keys_info->num_keys; int data_len = num_keys * 8 + 8; u8 *data; data = kzalloc(data_len, GFP_KERNEL); if (!data) return -ENOMEM; result = sd_pr_in_command(bdev, READ_KEYS, data, data_len); if (result) goto free_data; keys_info->generation = get_unaligned_be32(&data[0]); keys_info->num_keys = get_unaligned_be32(&data[4]) / 8; data_offset = 8; num_copy_keys = min(num_keys, keys_info->num_keys); for (i = 0; i < num_copy_keys; i++) { keys_info->keys[i] = get_unaligned_be64(&data[data_offset]); data_offset += 8; } free_data: kfree(data); return result; } static int sd_pr_read_reservation(struct block_device *bdev, struct pr_held_reservation *rsv) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdev = sdkp->device; u8 data[24] = { }; int result, len; result = sd_pr_in_command(bdev, READ_RESERVATION, data, sizeof(data)); if (result) return result; len = get_unaligned_be32(&data[4]); if (!len) return 0; /* Make sure we have at least the key and type */ if (len < 14) { sdev_printk(KERN_INFO, sdev, "READ RESERVATION failed due to short return buffer of %d bytes\n", len); return -EINVAL; } rsv->generation = get_unaligned_be32(&data[0]); rsv->key = get_unaligned_be64(&data[8]); rsv->type = scsi_pr_type_to_block(data[21] & 0x0f); return 0; } static int sd_pr_out_command(struct block_device *bdev, u8 sa, u64 key, u64 sa_key, enum scsi_pr_type type, u8 flags) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdev = sdkp->device; struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { { .sense = UNIT_ATTENTION, .asc = SCMD_FAILURE_ASC_ANY, .ascq = SCMD_FAILURE_ASCQ_ANY, .allowed = 5, .result = SAM_STAT_CHECK_CONDITION, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .failures = &failures, }; int result; u8 cmd[16] = { 0, }; u8 data[24] = { 0, }; cmd[0] = PERSISTENT_RESERVE_OUT; cmd[1] = sa; cmd[2] = type; put_unaligned_be32(sizeof(data), &cmd[5]); put_unaligned_be64(key, &data[0]); put_unaligned_be64(sa_key, &data[8]); data[20] = flags; result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_OUT, &data, sizeof(data), SD_TIMEOUT, sdkp->max_retries, &exec_args); if (scsi_status_is_check_condition(result) && scsi_sense_valid(&sshdr)) { sdev_printk(KERN_INFO, sdev, "PR command failed: %d\n", result); scsi_print_sense_hdr(sdev, NULL, &sshdr); } if (result <= 0) return result; return sd_scsi_to_pr_err(&sshdr, result); } static int sd_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, u32 flags) { if (flags & ~PR_FL_IGNORE_KEY) return -EOPNOTSUPP; return sd_pr_out_command(bdev, (flags & PR_FL_IGNORE_KEY) ? 0x06 : 0x00, old_key, new_key, 0, (1 << 0) /* APTPL */); } static int sd_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, u32 flags) { if (flags) return -EOPNOTSUPP; return sd_pr_out_command(bdev, 0x01, key, 0, block_pr_type_to_scsi(type), 0); } static int sd_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { return sd_pr_out_command(bdev, 0x02, key, 0, block_pr_type_to_scsi(type), 0); } static int sd_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, enum pr_type type, bool abort) { return sd_pr_out_command(bdev, abort ? 0x05 : 0x04, old_key, new_key, block_pr_type_to_scsi(type), 0); } static int sd_pr_clear(struct block_device *bdev, u64 key) { return sd_pr_out_command(bdev, 0x03, key, 0, 0, 0); } static const struct pr_ops sd_pr_ops = { .pr_register = sd_pr_register, .pr_reserve = sd_pr_reserve, .pr_release = sd_pr_release, .pr_preempt = sd_pr_preempt, .pr_clear = sd_pr_clear, .pr_read_keys = sd_pr_read_keys, .pr_read_reservation = sd_pr_read_reservation, }; static void scsi_disk_free_disk(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); put_device(&sdkp->disk_dev); } static const struct block_device_operations sd_fops = { .owner = THIS_MODULE, .open = sd_open, .release = sd_release, .ioctl = sd_ioctl, .getgeo = sd_getgeo, .compat_ioctl = blkdev_compat_ptr_ioctl, .check_events = sd_check_events, .unlock_native_capacity = sd_unlock_native_capacity, .report_zones = sd_zbc_report_zones, .get_unique_id = sd_get_unique_id, .free_disk = scsi_disk_free_disk, .pr_ops = &sd_pr_ops, }; /** * sd_eh_reset - reset error handling callback * @scmd: sd-issued command that has failed * * This function is called by the SCSI midlayer before starting * SCSI EH. When counting medium access failures we have to be * careful to register it only only once per device and SCSI EH run; * there might be several timed out commands which will cause the * 'max_medium_access_timeouts' counter to trigger after the first * SCSI EH run already and set the device to offline. * So this function resets the internal counter before starting SCSI EH. **/ static void sd_eh_reset(struct scsi_cmnd *scmd) { struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->q->disk); /* New SCSI EH run, reset gate variable */ sdkp->ignore_medium_access_errors = false; } /** * sd_eh_action - error handling callback * @scmd: sd-issued command that has failed * @eh_disp: The recovery disposition suggested by the midlayer * * This function is called by the SCSI midlayer upon completion of an * error test command (currently TEST UNIT READY). The result of sending * the eh command is passed in eh_disp. We're looking for devices that * fail medium access commands but are OK with non access commands like * test unit ready (so wrongly see the device as having a successful * recovery) **/ static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp) { struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->q->disk); struct scsi_device *sdev = scmd->device; if (!scsi_device_online(sdev) || !scsi_medium_access_command(scmd) || host_byte(scmd->result) != DID_TIME_OUT || eh_disp != SUCCESS) return eh_disp; /* * The device has timed out executing a medium access command. * However, the TEST UNIT READY command sent during error * handling completed successfully. Either the device is in the * process of recovering or has it suffered an internal failure * that prevents access to the storage medium. */ if (!sdkp->ignore_medium_access_errors) { sdkp->medium_access_timed_out++; sdkp->ignore_medium_access_errors = true; } /* * If the device keeps failing read/write commands but TEST UNIT * READY always completes successfully we assume that medium * access is no longer possible and take the device offline. */ if (sdkp->medium_access_timed_out >= sdkp->max_medium_access_timeouts) { scmd_printk(KERN_ERR, scmd, "Medium access timeout failure. Offlining disk!\n"); mutex_lock(&sdev->state_mutex); scsi_device_set_state(sdev, SDEV_OFFLINE); mutex_unlock(&sdev->state_mutex); return SUCCESS; } return eh_disp; } static unsigned int sd_completed_bytes(struct scsi_cmnd *scmd) { struct request *req = scsi_cmd_to_rq(scmd); struct scsi_device *sdev = scmd->device; unsigned int transferred, good_bytes; u64 start_lba, end_lba, bad_lba; /* * Some commands have a payload smaller than the device logical * block size (e.g. INQUIRY on a 4K disk). */ if (scsi_bufflen(scmd) <= sdev->sector_size) return 0; /* Check if we have a 'bad_lba' information */ if (!scsi_get_sense_info_fld(scmd->sense_buffer, SCSI_SENSE_BUFFERSIZE, &bad_lba)) return 0; /* * If the bad lba was reported incorrectly, we have no idea where * the error is. */ start_lba = sectors_to_logical(sdev, blk_rq_pos(req)); end_lba = start_lba + bytes_to_logical(sdev, scsi_bufflen(scmd)); if (bad_lba < start_lba || bad_lba >= end_lba) return 0; /* * resid is optional but mostly filled in. When it's unused, * its value is zero, so we assume the whole buffer transferred */ transferred = scsi_bufflen(scmd) - scsi_get_resid(scmd); /* This computation should always be done in terms of the * resolution of the device's medium. */ good_bytes = logical_to_bytes(sdev, bad_lba - start_lba); return min(good_bytes, transferred); } /** * sd_done - bottom half handler: called when the lower level * driver has completed (successfully or otherwise) a scsi command. * @SCpnt: mid-level's per command structure. * * Note: potentially run from within an ISR. Must not block. **/ static int sd_done(struct scsi_cmnd *SCpnt) { int result = SCpnt->result; unsigned int good_bytes = result ? 0 : scsi_bufflen(SCpnt); unsigned int sector_size = SCpnt->device->sector_size; unsigned int resid; struct scsi_sense_hdr sshdr; struct request *req = scsi_cmd_to_rq(SCpnt); struct scsi_disk *sdkp = scsi_disk(req->q->disk); int sense_valid = 0; int sense_deferred = 0; switch (req_op(req)) { case REQ_OP_DISCARD: case REQ_OP_WRITE_ZEROES: case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_RESET_ALL: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: if (!result) { good_bytes = blk_rq_bytes(req); scsi_set_resid(SCpnt, 0); } else { good_bytes = 0; scsi_set_resid(SCpnt, blk_rq_bytes(req)); } break; default: /* * In case of bogus fw or device, we could end up having * an unaligned partial completion. Check this here and force * alignment. */ resid = scsi_get_resid(SCpnt); if (resid & (sector_size - 1)) { sd_printk(KERN_INFO, sdkp, "Unaligned partial completion (resid=%u, sector_sz=%u)\n", resid, sector_size); scsi_print_command(SCpnt); resid = min(scsi_bufflen(SCpnt), round_up(resid, sector_size)); scsi_set_resid(SCpnt, resid); } } if (result) { sense_valid = scsi_command_normalize_sense(SCpnt, &sshdr); if (sense_valid) sense_deferred = scsi_sense_is_deferred(&sshdr); } sdkp->medium_access_timed_out = 0; if (!scsi_status_is_check_condition(result) && (!sense_valid || sense_deferred)) goto out; switch (sshdr.sense_key) { case HARDWARE_ERROR: case MEDIUM_ERROR: good_bytes = sd_completed_bytes(SCpnt); break; case RECOVERED_ERROR: good_bytes = scsi_bufflen(SCpnt); break; case NO_SENSE: /* This indicates a false check condition, so ignore it. An * unknown amount of data was transferred so treat it as an * error. */ SCpnt->result = 0; memset(SCpnt->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); break; case ABORTED_COMMAND: if (sshdr.asc == 0x10) /* DIF: Target detected corruption */ good_bytes = sd_completed_bytes(SCpnt); break; case ILLEGAL_REQUEST: switch (sshdr.asc) { case 0x10: /* DIX: Host detected corruption */ good_bytes = sd_completed_bytes(SCpnt); break; case 0x20: /* INVALID COMMAND OPCODE */ case 0x24: /* INVALID FIELD IN CDB */ switch (SCpnt->cmnd[0]) { case UNMAP: sd_disable_discard(sdkp); break; case WRITE_SAME_16: case WRITE_SAME: if (SCpnt->cmnd[1] & 8) { /* UNMAP */ sd_disable_discard(sdkp); } else { sd_disable_write_same(sdkp); req->rq_flags |= RQF_QUIET; } break; } } break; default: break; } out: if (sdkp->device->type == TYPE_ZBC) good_bytes = sd_zbc_complete(SCpnt, good_bytes, &sshdr); SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt, "sd_done: completed %d of %d bytes\n", good_bytes, scsi_bufflen(SCpnt))); return good_bytes; } /* * spinup disk - called only in sd_revalidate_disk() */ static void sd_spinup_disk(struct scsi_disk *sdkp) { static const u8 cmd[10] = { TEST_UNIT_READY }; unsigned long spintime_expire = 0; int spintime, sense_valid = 0; unsigned int the_result; struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { /* Do not retry Medium Not Present */ { .sense = UNIT_ATTENTION, .asc = 0x3A, .ascq = SCMD_FAILURE_ASCQ_ANY, .result = SAM_STAT_CHECK_CONDITION, }, { .sense = NOT_READY, .asc = 0x3A, .ascq = SCMD_FAILURE_ASCQ_ANY, .result = SAM_STAT_CHECK_CONDITION, }, /* Retry when scsi_status_is_good would return false 3 times */ { .result = SCMD_FAILURE_STAT_ANY, .allowed = 3, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .failures = &failures, }; spintime = 0; /* Spin up drives, as required. Only do this at boot time */ /* Spinup needs to be done for module loads too. */ do { bool media_was_present = sdkp->media_present; scsi_failures_reset_retries(&failures); the_result = scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN, NULL, 0, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (the_result > 0) { /* * If the drive has indicated to us that it doesn't * have any media in it, don't bother with any more * polling. */ if (media_not_present(sdkp, &sshdr)) { if (media_was_present) sd_printk(KERN_NOTICE, sdkp, "Media removed, stopped polling\n"); return; } sense_valid = scsi_sense_valid(&sshdr); } if (!scsi_status_is_check_condition(the_result)) { /* no sense, TUR either succeeded or failed * with a status error */ if(!spintime && !scsi_status_is_good(the_result)) { sd_print_result(sdkp, "Test Unit Ready failed", the_result); } break; } /* * The device does not want the automatic start to be issued. */ if (sdkp->device->no_start_on_add) break; if (sense_valid && sshdr.sense_key == NOT_READY) { if (sshdr.asc == 4 && sshdr.ascq == 3) break; /* manual intervention required */ if (sshdr.asc == 4 && sshdr.ascq == 0xb) break; /* standby */ if (sshdr.asc == 4 && sshdr.ascq == 0xc) break; /* unavailable */ if (sshdr.asc == 4 && sshdr.ascq == 0x1b) break; /* sanitize in progress */ if (sshdr.asc == 4 && sshdr.ascq == 0x24) break; /* depopulation in progress */ if (sshdr.asc == 4 && sshdr.ascq == 0x25) break; /* depopulation restoration in progress */ /* * Issue command to spin up drive when not ready */ if (!spintime) { /* Return immediately and start spin cycle */ const u8 start_cmd[10] = { [0] = START_STOP, [1] = 1, [4] = sdkp->device->start_stop_pwr_cond ? 0x11 : 1, }; sd_printk(KERN_NOTICE, sdkp, "Spinning up disk..."); scsi_execute_cmd(sdkp->device, start_cmd, REQ_OP_DRV_IN, NULL, 0, SD_TIMEOUT, sdkp->max_retries, &exec_args); spintime_expire = jiffies + 100 * HZ; spintime = 1; } /* Wait 1 second for next try */ msleep(1000); printk(KERN_CONT "."); /* * Wait for USB flash devices with slow firmware. * Yes, this sense key/ASC combination shouldn't * occur here. It's characteristic of these devices. */ } else if (sense_valid && sshdr.sense_key == UNIT_ATTENTION && sshdr.asc == 0x28) { if (!spintime) { spintime_expire = jiffies + 5 * HZ; spintime = 1; } /* Wait 1 second for next try */ msleep(1000); } else { /* we don't understand the sense code, so it's * probably pointless to loop */ if(!spintime) { sd_printk(KERN_NOTICE, sdkp, "Unit Not Ready\n"); sd_print_sense_hdr(sdkp, &sshdr); } break; } } while (spintime && time_before_eq(jiffies, spintime_expire)); if (spintime) { if (scsi_status_is_good(the_result)) printk(KERN_CONT "ready\n"); else printk(KERN_CONT "not responding...\n"); } } /* * Determine whether disk supports Data Integrity Field. */ static int sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdp = sdkp->device; u8 type; if (scsi_device_protection(sdp) == 0 || (buffer[12] & 1) == 0) { sdkp->protection_type = 0; return 0; } type = ((buffer[12] >> 1) & 7) + 1; /* P_TYPE 0 = Type 1 */ if (type > T10_PI_TYPE3_PROTECTION) { sd_printk(KERN_ERR, sdkp, "formatted with unsupported" \ " protection type %u. Disabling disk!\n", type); sdkp->protection_type = 0; return -ENODEV; } sdkp->protection_type = type; return 0; } static void sd_config_protection(struct scsi_disk *sdkp, struct queue_limits *lim) { struct scsi_device *sdp = sdkp->device; if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY)) sd_dif_config_host(sdkp, lim); if (!sdkp->protection_type) return; if (!scsi_host_dif_capable(sdp->host, sdkp->protection_type)) { sd_first_printk(KERN_NOTICE, sdkp, "Disabling DIF Type %u protection\n", sdkp->protection_type); sdkp->protection_type = 0; } sd_first_printk(KERN_NOTICE, sdkp, "Enabling DIF Type %u protection\n", sdkp->protection_type); } static void read_capacity_error(struct scsi_disk *sdkp, struct scsi_device *sdp, struct scsi_sense_hdr *sshdr, int sense_valid, int the_result) { if (sense_valid) sd_print_sense_hdr(sdkp, sshdr); else sd_printk(KERN_NOTICE, sdkp, "Sense not available.\n"); /* * Set dirty bit for removable devices if not ready - * sometimes drives will not report this properly. */ if (sdp->removable && sense_valid && sshdr->sense_key == NOT_READY) set_media_not_present(sdkp); /* * We used to set media_present to 0 here to indicate no media * in the drive, but some drives fail read capacity even with * media present, so we can't do that. */ sdkp->capacity = 0; /* unknown mapped to zero - as usual */ } #define RC16_LEN 32 #if RC16_LEN > SD_BUF_SIZE #error RC16_LEN must not be more than SD_BUF_SIZE #endif #define READ_CAPACITY_RETRIES_ON_RESET 10 static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, struct queue_limits *lim, unsigned char *buffer) { unsigned char cmd[16]; struct scsi_sense_hdr sshdr; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, }; int sense_valid = 0; int the_result; int retries = 3, reset_retries = READ_CAPACITY_RETRIES_ON_RESET; unsigned int alignment; unsigned long long lba; unsigned sector_size; if (sdp->no_read_capacity_16) return -EINVAL; do { memset(cmd, 0, 16); cmd[0] = SERVICE_ACTION_IN_16; cmd[1] = SAI_READ_CAPACITY_16; cmd[13] = RC16_LEN; memset(buffer, 0, RC16_LEN); the_result = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, buffer, RC16_LEN, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (the_result > 0) { if (media_not_present(sdkp, &sshdr)) return -ENODEV; sense_valid = scsi_sense_valid(&sshdr); if (sense_valid && sshdr.sense_key == ILLEGAL_REQUEST && (sshdr.asc == 0x20 || sshdr.asc == 0x24) && sshdr.ascq == 0x00) /* Invalid Command Operation Code or * Invalid Field in CDB, just retry * silently with RC10 */ return -EINVAL; if (sense_valid && sshdr.sense_key == UNIT_ATTENTION && sshdr.asc == 0x29 && sshdr.ascq == 0x00) /* Device reset might occur several times, * give it one more chance */ if (--reset_retries > 0) continue; } retries--; } while (the_result && retries); if (the_result) { sd_print_result(sdkp, "Read Capacity(16) failed", the_result); read_capacity_error(sdkp, sdp, &sshdr, sense_valid, the_result); return -EINVAL; } sector_size = get_unaligned_be32(&buffer[8]); lba = get_unaligned_be64(&buffer[0]); if (sd_read_protection_type(sdkp, buffer) < 0) { sdkp->capacity = 0; return -ENODEV; } /* Logical blocks per physical block exponent */ sdkp->physical_block_size = (1 << (buffer[13] & 0xf)) * sector_size; /* RC basis */ sdkp->rc_basis = (buffer[12] >> 4) & 0x3; /* Lowest aligned logical block */ alignment = ((buffer[14] & 0x3f) << 8 | buffer[15]) * sector_size; lim->alignment_offset = alignment; if (alignment && sdkp->first_scan) sd_printk(KERN_NOTICE, sdkp, "physical block alignment offset: %u\n", alignment); if (buffer[14] & 0x80) { /* LBPME */ sdkp->lbpme = 1; if (buffer[14] & 0x40) /* LBPRZ */ sdkp->lbprz = 1; } sdkp->capacity = lba + 1; return sector_size; } static int read_capacity_10(struct scsi_disk *sdkp, struct scsi_device *sdp, unsigned char *buffer) { static const u8 cmd[10] = { READ_CAPACITY }; struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { /* Do not retry Medium Not Present */ { .sense = UNIT_ATTENTION, .asc = 0x3A, .result = SAM_STAT_CHECK_CONDITION, }, { .sense = NOT_READY, .asc = 0x3A, .result = SAM_STAT_CHECK_CONDITION, }, /* Device reset might occur several times so retry a lot */ { .sense = UNIT_ATTENTION, .asc = 0x29, .allowed = READ_CAPACITY_RETRIES_ON_RESET, .result = SAM_STAT_CHECK_CONDITION, }, /* Any other error not listed above retry 3 times */ { .result = SCMD_FAILURE_RESULT_ANY, .allowed = 3, }, {} }; struct scsi_failures failures = { .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .failures = &failures, }; int sense_valid = 0; int the_result; sector_t lba; unsigned sector_size; memset(buffer, 0, 8); the_result = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, buffer, 8, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (the_result > 0) { sense_valid = scsi_sense_valid(&sshdr); if (media_not_present(sdkp, &sshdr)) return -ENODEV; } if (the_result) { sd_print_result(sdkp, "Read Capacity(10) failed", the_result); read_capacity_error(sdkp, sdp, &sshdr, sense_valid, the_result); return -EINVAL; } sector_size = get_unaligned_be32(&buffer[4]); lba = get_unaligned_be32(&buffer[0]); if (sdp->no_read_capacity_16 && (lba == 0xffffffff)) { /* Some buggy (usb cardreader) devices return an lba of 0xffffffff when the want to report a size of 0 (with which they really mean no media is present) */ sdkp->capacity = 0; sdkp->physical_block_size = sector_size; return sector_size; } sdkp->capacity = lba + 1; sdkp->physical_block_size = sector_size; return sector_size; } static int sd_try_rc16_first(struct scsi_device *sdp) { if (sdp->host->max_cmd_len < 16) return 0; if (sdp->try_rc_10_first) return 0; if (sdp->scsi_level > SCSI_SPC_2) return 1; if (scsi_device_protection(sdp)) return 1; return 0; } /* * read disk capacity */ static void sd_read_capacity(struct scsi_disk *sdkp, struct queue_limits *lim, unsigned char *buffer) { int sector_size; struct scsi_device *sdp = sdkp->device; if (sd_try_rc16_first(sdp)) { sector_size = read_capacity_16(sdkp, sdp, lim, buffer); if (sector_size == -EOVERFLOW) goto got_data; if (sector_size == -ENODEV) return; if (sector_size < 0) sector_size = read_capacity_10(sdkp, sdp, buffer); if (sector_size < 0) return; } else { sector_size = read_capacity_10(sdkp, sdp, buffer); if (sector_size == -EOVERFLOW) goto got_data; if (sector_size < 0) return; if ((sizeof(sdkp->capacity) > 4) && (sdkp->capacity > 0xffffffffULL)) { int old_sector_size = sector_size; sd_printk(KERN_NOTICE, sdkp, "Very big device. " "Trying to use READ CAPACITY(16).\n"); sector_size = read_capacity_16(sdkp, sdp, lim, buffer); if (sector_size < 0) { sd_printk(KERN_NOTICE, sdkp, "Using 0xffffffff as device size\n"); sdkp->capacity = 1 + (sector_t) 0xffffffff; sector_size = old_sector_size; goto got_data; } /* Remember that READ CAPACITY(16) succeeded */ sdp->try_rc_10_first = 0; } } /* Some devices are known to return the total number of blocks, * not the highest block number. Some devices have versions * which do this and others which do not. Some devices we might * suspect of doing this but we don't know for certain. * * If we know the reported capacity is wrong, decrement it. If * we can only guess, then assume the number of blocks is even * (usually true but not always) and err on the side of lowering * the capacity. */ if (sdp->fix_capacity || (sdp->guess_capacity && (sdkp->capacity & 0x01))) { sd_printk(KERN_INFO, sdkp, "Adjusting the sector count " "from its reported value: %llu\n", (unsigned long long) sdkp->capacity); --sdkp->capacity; } got_data: if (sector_size == 0) { sector_size = 512; sd_printk(KERN_NOTICE, sdkp, "Sector size 0 reported, " "assuming 512.\n"); } if (sector_size != 512 && sector_size != 1024 && sector_size != 2048 && sector_size != 4096) { sd_printk(KERN_NOTICE, sdkp, "Unsupported sector size %d.\n", sector_size); /* * The user might want to re-format the drive with * a supported sectorsize. Once this happens, it * would be relatively trivial to set the thing up. * For this reason, we leave the thing in the table. */ sdkp->capacity = 0; /* * set a bogus sector size so the normal read/write * logic in the block layer will eventually refuse any * request on this device without tripping over power * of two sector size assumptions */ sector_size = 512; } lim->logical_block_size = sector_size; lim->physical_block_size = sdkp->physical_block_size; sdkp->device->sector_size = sector_size; if (sdkp->capacity > 0xffffffff) sdp->use_16_for_rw = 1; } /* * Print disk capacity */ static void sd_print_capacity(struct scsi_disk *sdkp, sector_t old_capacity) { int sector_size = sdkp->device->sector_size; char cap_str_2[10], cap_str_10[10]; if (!sdkp->first_scan && old_capacity == sdkp->capacity) return; string_get_size(sdkp->capacity, sector_size, STRING_UNITS_2, cap_str_2, sizeof(cap_str_2)); string_get_size(sdkp->capacity, sector_size, STRING_UNITS_10, cap_str_10, sizeof(cap_str_10)); sd_printk(KERN_NOTICE, sdkp, "%llu %d-byte logical blocks: (%s/%s)\n", (unsigned long long)sdkp->capacity, sector_size, cap_str_10, cap_str_2); if (sdkp->physical_block_size != sector_size) sd_printk(KERN_NOTICE, sdkp, "%u-byte physical blocks\n", sdkp->physical_block_size); } /* called with buffer of length 512 */ static inline int sd_do_mode_sense(struct scsi_disk *sdkp, int dbd, int modepage, unsigned char *buffer, int len, struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr) { /* * If we must use MODE SENSE(10), make sure that the buffer length * is at least 8 bytes so that the mode sense header fits. */ if (sdkp->device->use_10_for_ms && len < 8) len = 8; return scsi_mode_sense(sdkp->device, dbd, modepage, 0, buffer, len, SD_TIMEOUT, sdkp->max_retries, data, sshdr); } /* * read write protect setting, if possible - called only in sd_revalidate_disk() * called with buffer of length SD_BUF_SIZE */ static void sd_read_write_protect_flag(struct scsi_disk *sdkp, unsigned char *buffer) { int res; struct scsi_device *sdp = sdkp->device; struct scsi_mode_data data; int old_wp = sdkp->write_prot; set_disk_ro(sdkp->disk, 0); if (sdp->skip_ms_page_3f) { sd_first_printk(KERN_NOTICE, sdkp, "Assuming Write Enabled\n"); return; } if (sdp->use_192_bytes_for_3f) { res = sd_do_mode_sense(sdkp, 0, 0x3F, buffer, 192, &data, NULL); } else { /* * First attempt: ask for all pages (0x3F), but only 4 bytes. * We have to start carefully: some devices hang if we ask * for more than is available. */ res = sd_do_mode_sense(sdkp, 0, 0x3F, buffer, 4, &data, NULL); /* * Second attempt: ask for page 0 When only page 0 is * implemented, a request for page 3F may return Sense Key * 5: Illegal Request, Sense Code 24: Invalid field in * CDB. */ if (res < 0) res = sd_do_mode_sense(sdkp, 0, 0, buffer, 4, &data, NULL); /* * Third attempt: ask 255 bytes, as we did earlier. */ if (res < 0) res = sd_do_mode_sense(sdkp, 0, 0x3F, buffer, 255, &data, NULL); } if (res < 0) { sd_first_printk(KERN_WARNING, sdkp, "Test WP failed, assume Write Enabled\n"); } else { sdkp->write_prot = ((data.device_specific & 0x80) != 0); set_disk_ro(sdkp->disk, sdkp->write_prot); if (sdkp->first_scan || old_wp != sdkp->write_prot) { sd_printk(KERN_NOTICE, sdkp, "Write Protect is %s\n", sdkp->write_prot ? "on" : "off"); sd_printk(KERN_DEBUG, sdkp, "Mode Sense: %4ph\n", buffer); } } } /* * sd_read_cache_type - called only from sd_revalidate_disk() * called with buffer of length SD_BUF_SIZE */ static void sd_read_cache_type(struct scsi_disk *sdkp, unsigned char *buffer) { int len = 0, res; struct scsi_device *sdp = sdkp->device; int dbd; int modepage; int first_len; struct scsi_mode_data data; struct scsi_sense_hdr sshdr; int old_wce = sdkp->WCE; int old_rcd = sdkp->RCD; int old_dpofua = sdkp->DPOFUA; if (sdkp->cache_override) return; first_len = 4; if (sdp->skip_ms_page_8) { if (sdp->type == TYPE_RBC) goto defaults; else { if (sdp->skip_ms_page_3f) goto defaults; modepage = 0x3F; if (sdp->use_192_bytes_for_3f) first_len = 192; dbd = 0; } } else if (sdp->type == TYPE_RBC) { modepage = 6; dbd = 8; } else { modepage = 8; dbd = 0; } /* cautiously ask */ res = sd_do_mode_sense(sdkp, dbd, modepage, buffer, first_len, &data, &sshdr); if (res < 0) goto bad_sense; if (!data.header_length) { modepage = 6; first_len = 0; sd_first_printk(KERN_ERR, sdkp, "Missing header in MODE_SENSE response\n"); } /* that went OK, now ask for the proper length */ len = data.length; /* * We're only interested in the first three bytes, actually. * But the data cache page is defined for the first 20. */ if (len < 3) goto bad_sense; else if (len > SD_BUF_SIZE) { sd_first_printk(KERN_NOTICE, sdkp, "Truncating mode parameter " "data from %d to %d bytes\n", len, SD_BUF_SIZE); len = SD_BUF_SIZE; } if (modepage == 0x3F && sdp->use_192_bytes_for_3f) len = 192; /* Get the data */ if (len > first_len) res = sd_do_mode_sense(sdkp, dbd, modepage, buffer, len, &data, &sshdr); if (!res) { int offset = data.header_length + data.block_descriptor_length; while (offset < len) { u8 page_code = buffer[offset] & 0x3F; u8 spf = buffer[offset] & 0x40; if (page_code == 8 || page_code == 6) { /* We're interested only in the first 3 bytes. */ if (len - offset <= 2) { sd_first_printk(KERN_ERR, sdkp, "Incomplete mode parameter " "data\n"); goto defaults; } else { modepage = page_code; goto Page_found; } } else { /* Go to the next page */ if (spf && len - offset > 3) offset += 4 + (buffer[offset+2] << 8) + buffer[offset+3]; else if (!spf && len - offset > 1) offset += 2 + buffer[offset+1]; else { sd_first_printk(KERN_ERR, sdkp, "Incomplete mode " "parameter data\n"); goto defaults; } } } sd_first_printk(KERN_WARNING, sdkp, "No Caching mode page found\n"); goto defaults; Page_found: if (modepage == 8) { sdkp->WCE = ((buffer[offset + 2] & 0x04) != 0); sdkp->RCD = ((buffer[offset + 2] & 0x01) != 0); } else { sdkp->WCE = ((buffer[offset + 2] & 0x01) == 0); sdkp->RCD = 0; } sdkp->DPOFUA = (data.device_specific & 0x10) != 0; if (sdp->broken_fua) { sd_first_printk(KERN_NOTICE, sdkp, "Disabling FUA\n"); sdkp->DPOFUA = 0; } else if (sdkp->DPOFUA && !sdkp->device->use_10_for_rw && !sdkp->device->use_16_for_rw) { sd_first_printk(KERN_NOTICE, sdkp, "Uses READ/WRITE(6), disabling FUA\n"); sdkp->DPOFUA = 0; } /* No cache flush allowed for write protected devices */ if (sdkp->WCE && sdkp->write_prot) sdkp->WCE = 0; if (sdkp->first_scan || old_wce != sdkp->WCE || old_rcd != sdkp->RCD || old_dpofua != sdkp->DPOFUA) sd_printk(KERN_NOTICE, sdkp, "Write cache: %s, read cache: %s, %s\n", sdkp->WCE ? "enabled" : "disabled", sdkp->RCD ? "disabled" : "enabled", sdkp->DPOFUA ? "supports DPO and FUA" : "doesn't support DPO or FUA"); return; } bad_sense: if (res == -EIO && scsi_sense_valid(&sshdr) && sshdr.sense_key == ILLEGAL_REQUEST && sshdr.asc == 0x24 && sshdr.ascq == 0x0) /* Invalid field in CDB */ sd_first_printk(KERN_NOTICE, sdkp, "Cache data unavailable\n"); else sd_first_printk(KERN_ERR, sdkp, "Asking for cache data failed\n"); defaults: if (sdp->wce_default_on) { sd_first_printk(KERN_NOTICE, sdkp, "Assuming drive cache: write back\n"); sdkp->WCE = 1; } else { sd_first_printk(KERN_WARNING, sdkp, "Assuming drive cache: write through\n"); sdkp->WCE = 0; } sdkp->RCD = 0; sdkp->DPOFUA = 0; } static bool sd_is_perm_stream(struct scsi_disk *sdkp, unsigned int stream_id) { u8 cdb[16] = { SERVICE_ACTION_IN_16, SAI_GET_STREAM_STATUS }; struct { struct scsi_stream_status_header h; struct scsi_stream_status s; } buf; struct scsi_device *sdev = sdkp->device; struct scsi_sense_hdr sshdr; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, }; int res; put_unaligned_be16(stream_id, &cdb[4]); put_unaligned_be32(sizeof(buf), &cdb[10]); res = scsi_execute_cmd(sdev, cdb, REQ_OP_DRV_IN, &buf, sizeof(buf), SD_TIMEOUT, sdkp->max_retries, &exec_args); if (res < 0) return false; if (scsi_status_is_check_condition(res) && scsi_sense_valid(&sshdr)) sd_print_sense_hdr(sdkp, &sshdr); if (res) return false; if (get_unaligned_be32(&buf.h.len) < sizeof(struct scsi_stream_status)) return false; return buf.h.stream_status[0].perm; } static void sd_read_io_hints(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdp = sdkp->device; const struct scsi_io_group_descriptor *desc, *start, *end; u16 permanent_stream_count_old; struct scsi_sense_hdr sshdr; struct scsi_mode_data data; int res; if (sdp->sdev_bflags & BLIST_SKIP_IO_HINTS) return; res = scsi_mode_sense(sdp, /*dbd=*/0x8, /*modepage=*/0x0a, /*subpage=*/0x05, buffer, SD_BUF_SIZE, SD_TIMEOUT, sdkp->max_retries, &data, &sshdr); if (res < 0) return; start = (void *)buffer + data.header_length + 16; end = (void *)buffer + ALIGN_DOWN(data.header_length + data.length, sizeof(*end)); /* * From "SBC-5 Constrained Streams with Data Lifetimes": Device severs * should assign the lowest numbered stream identifiers to permanent * streams. */ for (desc = start; desc < end; desc++) if (!desc->st_enble || !sd_is_perm_stream(sdkp, desc - start)) break; permanent_stream_count_old = sdkp->permanent_stream_count; sdkp->permanent_stream_count = desc - start; if (sdkp->rscs && sdkp->permanent_stream_count < 2) sd_printk(KERN_INFO, sdkp, "Unexpected: RSCS has been set and the permanent stream count is %u\n", sdkp->permanent_stream_count); else if (sdkp->permanent_stream_count != permanent_stream_count_old) sd_printk(KERN_INFO, sdkp, "permanent stream count = %d\n", sdkp->permanent_stream_count); } /* * The ATO bit indicates whether the DIF application tag is available * for use by the operating system. */ static void sd_read_app_tag_own(struct scsi_disk *sdkp, unsigned char *buffer) { int res, offset; struct scsi_device *sdp = sdkp->device; struct scsi_mode_data data; struct scsi_sense_hdr sshdr; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) return; if (sdkp->protection_type == 0) return; res = scsi_mode_sense(sdp, 1, 0x0a, 0, buffer, 36, SD_TIMEOUT, sdkp->max_retries, &data, &sshdr); if (res < 0 || !data.header_length || data.length < 6) { sd_first_printk(KERN_WARNING, sdkp, "getting Control mode page failed, assume no ATO\n"); if (res == -EIO && scsi_sense_valid(&sshdr)) sd_print_sense_hdr(sdkp, &sshdr); return; } offset = data.header_length + data.block_descriptor_length; if ((buffer[offset] & 0x3f) != 0x0a) { sd_first_printk(KERN_ERR, sdkp, "ATO Got wrong page\n"); return; } if ((buffer[offset + 5] & 0x80) == 0) return; sdkp->ATO = 1; return; } static unsigned int sd_discard_mode(struct scsi_disk *sdkp) { if (!sdkp->lbpme) return SD_LBP_FULL; if (!sdkp->lbpvpd) { /* LBP VPD page not provided */ if (sdkp->max_unmap_blocks) return SD_LBP_UNMAP; return SD_LBP_WS16; } /* LBP VPD page tells us what to use */ if (sdkp->lbpu && sdkp->max_unmap_blocks) return SD_LBP_UNMAP; if (sdkp->lbpws) return SD_LBP_WS16; if (sdkp->lbpws10) return SD_LBP_WS10; return SD_LBP_DISABLE; } /* * Query disk device for preferred I/O sizes. */ static void sd_read_block_limits(struct scsi_disk *sdkp, struct queue_limits *lim) { struct scsi_vpd *vpd; rcu_read_lock(); vpd = rcu_dereference(sdkp->device->vpd_pgb0); if (!vpd || vpd->len < 16) goto out; sdkp->min_xfer_blocks = get_unaligned_be16(&vpd->data[6]); sdkp->max_xfer_blocks = get_unaligned_be32(&vpd->data[8]); sdkp->opt_xfer_blocks = get_unaligned_be32(&vpd->data[12]); if (vpd->len >= 64) { unsigned int lba_count, desc_count; sdkp->max_ws_blocks = (u32)get_unaligned_be64(&vpd->data[36]); if (!sdkp->lbpme) goto config_atomic; lba_count = get_unaligned_be32(&vpd->data[20]); desc_count = get_unaligned_be32(&vpd->data[24]); if (lba_count && desc_count) sdkp->max_unmap_blocks = lba_count; sdkp->unmap_granularity = get_unaligned_be32(&vpd->data[28]); if (vpd->data[32] & 0x80) sdkp->unmap_alignment = get_unaligned_be32(&vpd->data[32]) & ~(1 << 31); config_atomic: sdkp->max_atomic = get_unaligned_be32(&vpd->data[44]); sdkp->atomic_alignment = get_unaligned_be32(&vpd->data[48]); sdkp->atomic_granularity = get_unaligned_be32(&vpd->data[52]); sdkp->max_atomic_with_boundary = get_unaligned_be32(&vpd->data[56]); sdkp->max_atomic_boundary = get_unaligned_be32(&vpd->data[60]); sd_config_atomic(sdkp, lim); } out: rcu_read_unlock(); } /* Parse the Block Limits Extension VPD page (0xb7) */ static void sd_read_block_limits_ext(struct scsi_disk *sdkp) { struct scsi_vpd *vpd; rcu_read_lock(); vpd = rcu_dereference(sdkp->device->vpd_pgb7); if (vpd && vpd->len >= 2) sdkp->rscs = vpd->data[5] & 1; rcu_read_unlock(); } /* Query block device characteristics */ static void sd_read_block_characteristics(struct scsi_disk *sdkp, struct queue_limits *lim) { struct scsi_vpd *vpd; u16 rot; rcu_read_lock(); vpd = rcu_dereference(sdkp->device->vpd_pgb1); if (!vpd || vpd->len <= 8) { rcu_read_unlock(); return; } rot = get_unaligned_be16(&vpd->data[4]); sdkp->zoned = (vpd->data[8] >> 4) & 3; rcu_read_unlock(); if (rot == 1) lim->features &= ~(BLK_FEAT_ROTATIONAL | BLK_FEAT_ADD_RANDOM); if (!sdkp->first_scan) return; if (sdkp->device->type == TYPE_ZBC) sd_printk(KERN_NOTICE, sdkp, "Host-managed zoned block device\n"); else if (sdkp->zoned == 1) sd_printk(KERN_NOTICE, sdkp, "Host-aware SMR disk used as regular disk\n"); else if (sdkp->zoned == 2) sd_printk(KERN_NOTICE, sdkp, "Drive-managed SMR disk\n"); } /** * sd_read_block_provisioning - Query provisioning VPD page * @sdkp: disk to query */ static void sd_read_block_provisioning(struct scsi_disk *sdkp) { struct scsi_vpd *vpd; if (sdkp->lbpme == 0) return; rcu_read_lock(); vpd = rcu_dereference(sdkp->device->vpd_pgb2); if (!vpd || vpd->len < 8) { rcu_read_unlock(); return; } sdkp->lbpvpd = 1; sdkp->lbpu = (vpd->data[5] >> 7) & 1; /* UNMAP */ sdkp->lbpws = (vpd->data[5] >> 6) & 1; /* WRITE SAME(16) w/ UNMAP */ sdkp->lbpws10 = (vpd->data[5] >> 5) & 1; /* WRITE SAME(10) w/ UNMAP */ rcu_read_unlock(); } static void sd_read_write_same(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdev = sdkp->device; if (sdev->host->no_write_same) { sdev->no_write_same = 1; return; } if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, INQUIRY, 0) < 0) { struct scsi_vpd *vpd; sdev->no_report_opcodes = 1; /* Disable WRITE SAME if REPORT SUPPORTED OPERATION * CODES is unsupported and the device has an ATA * Information VPD page (SAT). */ rcu_read_lock(); vpd = rcu_dereference(sdev->vpd_pg89); if (vpd) sdev->no_write_same = 1; rcu_read_unlock(); } if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, WRITE_SAME_16, 0) == 1) sdkp->ws16 = 1; if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, WRITE_SAME, 0) == 1) sdkp->ws10 = 1; } static void sd_read_security(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdev = sdkp->device; if (!sdev->security_supported) return; if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, SECURITY_PROTOCOL_IN, 0) == 1 && scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, SECURITY_PROTOCOL_OUT, 0) == 1) sdkp->security = 1; } static inline sector_t sd64_to_sectors(struct scsi_disk *sdkp, u8 *buf) { return logical_to_sectors(sdkp->device, get_unaligned_be64(buf)); } /** * sd_read_cpr - Query concurrent positioning ranges * @sdkp: disk to query */ static void sd_read_cpr(struct scsi_disk *sdkp) { struct blk_independent_access_ranges *iars = NULL; unsigned char *buffer = NULL; unsigned int nr_cpr = 0; int i, vpd_len, buf_len = SD_BUF_SIZE; u8 *desc; /* * We need to have the capacity set first for the block layer to be * able to check the ranges. */ if (sdkp->first_scan) return; if (!sdkp->capacity) goto out; /* * Concurrent Positioning Ranges VPD: there can be at most 256 ranges, * leading to a maximum page size of 64 + 256*32 bytes. */ buf_len = 64 + 256*32; buffer = kmalloc(buf_len, GFP_KERNEL); if (!buffer || scsi_get_vpd_page(sdkp->device, 0xb9, buffer, buf_len)) goto out; /* We must have at least a 64B header and one 32B range descriptor */ vpd_len = get_unaligned_be16(&buffer[2]) + 4; if (vpd_len > buf_len || vpd_len < 64 + 32 || (vpd_len & 31)) { sd_printk(KERN_ERR, sdkp, "Invalid Concurrent Positioning Ranges VPD page\n"); goto out; } nr_cpr = (vpd_len - 64) / 32; if (nr_cpr == 1) { nr_cpr = 0; goto out; } iars = disk_alloc_independent_access_ranges(sdkp->disk, nr_cpr); if (!iars) { nr_cpr = 0; goto out; } desc = &buffer[64]; for (i = 0; i < nr_cpr; i++, desc += 32) { if (desc[0] != i) { sd_printk(KERN_ERR, sdkp, "Invalid Concurrent Positioning Range number\n"); nr_cpr = 0; break; } iars->ia_range[i].sector = sd64_to_sectors(sdkp, desc + 8); iars->ia_range[i].nr_sectors = sd64_to_sectors(sdkp, desc + 16); } out: disk_set_independent_access_ranges(sdkp->disk, iars); if (nr_cpr && sdkp->nr_actuators != nr_cpr) { sd_printk(KERN_NOTICE, sdkp, "%u concurrent positioning ranges\n", nr_cpr); sdkp->nr_actuators = nr_cpr; } kfree(buffer); } static bool sd_validate_min_xfer_size(struct scsi_disk *sdkp) { struct scsi_device *sdp = sdkp->device; unsigned int min_xfer_bytes = logical_to_bytes(sdp, sdkp->min_xfer_blocks); if (sdkp->min_xfer_blocks == 0) return false; if (min_xfer_bytes & (sdkp->physical_block_size - 1)) { sd_first_printk(KERN_WARNING, sdkp, "Preferred minimum I/O size %u bytes not a " \ "multiple of physical block size (%u bytes)\n", min_xfer_bytes, sdkp->physical_block_size); sdkp->min_xfer_blocks = 0; return false; } sd_first_printk(KERN_INFO, sdkp, "Preferred minimum I/O size %u bytes\n", min_xfer_bytes); return true; } /* * Determine the device's preferred I/O size for reads and writes * unless the reported value is unreasonably small, large, not a * multiple of the physical block size, or simply garbage. */ static bool sd_validate_opt_xfer_size(struct scsi_disk *sdkp, unsigned int dev_max) { struct scsi_device *sdp = sdkp->device; unsigned int opt_xfer_bytes = logical_to_bytes(sdp, sdkp->opt_xfer_blocks); unsigned int min_xfer_bytes = logical_to_bytes(sdp, sdkp->min_xfer_blocks); if (sdkp->opt_xfer_blocks == 0) return false; if (sdkp->opt_xfer_blocks > dev_max) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u logical blocks " \ "> dev_max (%u logical blocks)\n", sdkp->opt_xfer_blocks, dev_max); return false; } if (sdkp->opt_xfer_blocks > SD_DEF_XFER_BLOCKS) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u logical blocks " \ "> sd driver limit (%u logical blocks)\n", sdkp->opt_xfer_blocks, SD_DEF_XFER_BLOCKS); return false; } if (opt_xfer_bytes < PAGE_SIZE) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u bytes < " \ "PAGE_SIZE (%u bytes)\n", opt_xfer_bytes, (unsigned int)PAGE_SIZE); return false; } if (min_xfer_bytes && opt_xfer_bytes % min_xfer_bytes) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u bytes not a " \ "multiple of preferred minimum block " \ "size (%u bytes)\n", opt_xfer_bytes, min_xfer_bytes); return false; } if (opt_xfer_bytes & (sdkp->physical_block_size - 1)) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u bytes not a " \ "multiple of physical block size (%u bytes)\n", opt_xfer_bytes, sdkp->physical_block_size); return false; } sd_first_printk(KERN_INFO, sdkp, "Optimal transfer size %u bytes\n", opt_xfer_bytes); return true; } static void sd_read_block_zero(struct scsi_disk *sdkp) { struct scsi_device *sdev = sdkp->device; unsigned int buf_len = sdev->sector_size; u8 *buffer, cmd[16] = { }; buffer = kmalloc(buf_len, GFP_KERNEL); if (!buffer) return; if (sdev->use_16_for_rw) { cmd[0] = READ_16; put_unaligned_be64(0, &cmd[2]); /* Logical block address 0 */ put_unaligned_be32(1, &cmd[10]);/* Transfer 1 logical block */ } else { cmd[0] = READ_10; put_unaligned_be32(0, &cmd[2]); /* Logical block address 0 */ put_unaligned_be16(1, &cmd[7]); /* Transfer 1 logical block */ } scsi_execute_cmd(sdkp->device, cmd, REQ_OP_DRV_IN, buffer, buf_len, SD_TIMEOUT, sdkp->max_retries, NULL); kfree(buffer); } /** * sd_revalidate_disk - called the first time a new disk is seen, * performs disk spin up, read_capacity, etc. * @disk: struct gendisk we care about **/ static int sd_revalidate_disk(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; sector_t old_capacity = sdkp->capacity; struct queue_limits lim; unsigned char *buffer; unsigned int dev_max; int err; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_revalidate_disk\n")); /* * If the device is offline, don't try and read capacity or any * of the other niceties. */ if (!scsi_device_online(sdp)) goto out; buffer = kmalloc(SD_BUF_SIZE, GFP_KERNEL); if (!buffer) { sd_printk(KERN_WARNING, sdkp, "sd_revalidate_disk: Memory " "allocation failure.\n"); goto out; } sd_spinup_disk(sdkp); lim = queue_limits_start_update(sdkp->disk->queue); /* * Without media there is no reason to ask; moreover, some devices * react badly if we do. */ if (sdkp->media_present) { sd_read_capacity(sdkp, &lim, buffer); /* * Some USB/UAS devices return generic values for mode pages * until the media has been accessed. Trigger a READ operation * to force the device to populate mode pages. */ if (sdp->read_before_ms) sd_read_block_zero(sdkp); /* * set the default to rotational. All non-rotational devices * support the block characteristics VPD page, which will * cause this to be updated correctly and any device which * doesn't support it should be treated as rotational. */ lim.features |= (BLK_FEAT_ROTATIONAL | BLK_FEAT_ADD_RANDOM); if (scsi_device_supports_vpd(sdp)) { sd_read_block_provisioning(sdkp); sd_read_block_limits(sdkp, &lim); sd_read_block_limits_ext(sdkp); sd_read_block_characteristics(sdkp, &lim); sd_zbc_read_zones(sdkp, &lim, buffer); } sd_config_discard(sdkp, &lim, sd_discard_mode(sdkp)); sd_print_capacity(sdkp, old_capacity); sd_read_write_protect_flag(sdkp, buffer); sd_read_cache_type(sdkp, buffer); sd_read_io_hints(sdkp, buffer); sd_read_app_tag_own(sdkp, buffer); sd_read_write_same(sdkp, buffer); sd_read_security(sdkp, buffer); sd_config_protection(sdkp, &lim); } /* * We now have all cache related info, determine how we deal * with flush requests. */ sd_set_flush_flag(sdkp, &lim); /* Initial block count limit based on CDB TRANSFER LENGTH field size. */ dev_max = sdp->use_16_for_rw ? SD_MAX_XFER_BLOCKS : SD_DEF_XFER_BLOCKS; /* Some devices report a maximum block count for READ/WRITE requests. */ dev_max = min_not_zero(dev_max, sdkp->max_xfer_blocks); lim.max_dev_sectors = logical_to_sectors(sdp, dev_max); if (sd_validate_min_xfer_size(sdkp)) lim.io_min = logical_to_bytes(sdp, sdkp->min_xfer_blocks); else lim.io_min = 0; /* * Limit default to SCSI host optimal sector limit if set. There may be * an impact on performance for when the size of a request exceeds this * host limit. */ lim.io_opt = sdp->host->opt_sectors << SECTOR_SHIFT; if (sd_validate_opt_xfer_size(sdkp, dev_max)) { lim.io_opt = min_not_zero(lim.io_opt, logical_to_bytes(sdp, sdkp->opt_xfer_blocks)); } sdkp->first_scan = 0; set_capacity_and_notify(disk, logical_to_sectors(sdp, sdkp->capacity)); sd_config_write_same(sdkp, &lim); kfree(buffer); blk_mq_freeze_queue(sdkp->disk->queue); err = queue_limits_commit_update(sdkp->disk->queue, &lim); blk_mq_unfreeze_queue(sdkp->disk->queue); if (err) return err; /* * Query concurrent positioning ranges after * queue_limits_commit_update() unlocked q->limits_lock to avoid * deadlock with q->sysfs_dir_lock and q->sysfs_lock. */ if (sdkp->media_present && scsi_device_supports_vpd(sdp)) sd_read_cpr(sdkp); /* * For a zoned drive, revalidating the zones can be done only once * the gendisk capacity is set. So if this fails, set back the gendisk * capacity to 0. */ if (sd_zbc_revalidate_zones(sdkp)) set_capacity_and_notify(disk, 0); out: return 0; } /** * sd_unlock_native_capacity - unlock native capacity * @disk: struct gendisk to set capacity for * * Block layer calls this function if it detects that partitions * on @disk reach beyond the end of the device. If the SCSI host * implements ->unlock_native_capacity() method, it's invoked to * give it a chance to adjust the device capacity. * * CONTEXT: * Defined by block layer. Might sleep. */ static void sd_unlock_native_capacity(struct gendisk *disk) { struct scsi_device *sdev = scsi_disk(disk)->device; if (sdev->host->hostt->unlock_native_capacity) sdev->host->hostt->unlock_native_capacity(sdev); } /** * sd_format_disk_name - format disk name * @prefix: name prefix - ie. "sd" for SCSI disks * @index: index of the disk to format name for * @buf: output buffer * @buflen: length of the output buffer * * SCSI disk names starts at sda. The 26th device is sdz and the * 27th is sdaa. The last one for two lettered suffix is sdzz * which is followed by sdaaa. * * This is basically 26 base counting with one extra 'nil' entry * at the beginning from the second digit on and can be * determined using similar method as 26 base conversion with the * index shifted -1 after each digit is computed. * * CONTEXT: * Don't care. * * RETURNS: * 0 on success, -errno on failure. */ static int sd_format_disk_name(char *prefix, int index, char *buf, int buflen) { const int base = 'z' - 'a' + 1; char *begin = buf + strlen(prefix); char *end = buf + buflen; char *p; int unit; p = end - 1; *p = '\0'; unit = base; do { if (p == begin) return -EINVAL; *--p = 'a' + (index % unit); index = (index / unit) - 1; } while (index >= 0); memmove(begin, p, end - p); memcpy(buf, prefix, strlen(prefix)); return 0; } /** * sd_probe - called during driver initialization and whenever a * new scsi device is attached to the system. It is called once * for each scsi device (not just disks) present. * @dev: pointer to device object * * Returns 0 if successful (or not interested in this scsi device * (e.g. scanner)); 1 when there is an error. * * Note: this function is invoked from the scsi mid-level. * This function sets up the mapping between a given * <host,channel,id,lun> (found in sdp) and new device name * (e.g. /dev/sda). More precisely it is the block device major * and minor number that is chosen here. * * Assume sd_probe is not re-entrant (for time being) * Also think about sd_probe() and sd_remove() running coincidentally. **/ static int sd_probe(struct device *dev) { struct scsi_device *sdp = to_scsi_device(dev); struct scsi_disk *sdkp; struct gendisk *gd; int index; int error; scsi_autopm_get_device(sdp); error = -ENODEV; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC && sdp->type != TYPE_MOD && sdp->type != TYPE_RBC) goto out; if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) && sdp->type == TYPE_ZBC) { sdev_printk(KERN_WARNING, sdp, "Unsupported ZBC host-managed device.\n"); goto out; } SCSI_LOG_HLQUEUE(3, sdev_printk(KERN_INFO, sdp, "sd_probe\n")); error = -ENOMEM; sdkp = kzalloc(sizeof(*sdkp), GFP_KERNEL); if (!sdkp) goto out; gd = blk_mq_alloc_disk_for_queue(sdp->request_queue, &sd_bio_compl_lkclass); if (!gd) goto out_free; index = ida_alloc(&sd_index_ida, GFP_KERNEL); if (index < 0) { sdev_printk(KERN_WARNING, sdp, "sd_probe: memory exhausted.\n"); goto out_put; } error = sd_format_disk_name("sd", index, gd->disk_name, DISK_NAME_LEN); if (error) { sdev_printk(KERN_WARNING, sdp, "SCSI disk (sd) name length exceeded.\n"); goto out_free_index; } sdkp->device = sdp; sdkp->disk = gd; sdkp->index = index; sdkp->max_retries = SD_MAX_RETRIES; atomic_set(&sdkp->openers, 0); atomic_set(&sdkp->device->ioerr_cnt, 0); if (!sdp->request_queue->rq_timeout) { if (sdp->type != TYPE_MOD) blk_queue_rq_timeout(sdp->request_queue, SD_TIMEOUT); else blk_queue_rq_timeout(sdp->request_queue, SD_MOD_TIMEOUT); } device_initialize(&sdkp->disk_dev); sdkp->disk_dev.parent = get_device(dev); sdkp->disk_dev.class = &sd_disk_class; dev_set_name(&sdkp->disk_dev, "%s", dev_name(dev)); error = device_add(&sdkp->disk_dev); if (error) { put_device(&sdkp->disk_dev); goto out; } dev_set_drvdata(dev, sdkp); gd->major = sd_major((index & 0xf0) >> 4); gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00); gd->minors = SD_MINORS; gd->fops = &sd_fops; gd->private_data = sdkp; /* defaults, until the device tells us otherwise */ sdp->sector_size = 512; sdkp->capacity = 0; sdkp->media_present = 1; sdkp->write_prot = 0; sdkp->cache_override = 0; sdkp->WCE = 0; sdkp->RCD = 0; sdkp->ATO = 0; sdkp->first_scan = 1; sdkp->max_medium_access_timeouts = SD_MAX_MEDIUM_TIMEOUTS; sd_revalidate_disk(gd); if (sdp->removable) { gd->flags |= GENHD_FL_REMOVABLE; gd->events |= DISK_EVENT_MEDIA_CHANGE; gd->event_flags = DISK_EVENT_FLAG_POLL | DISK_EVENT_FLAG_UEVENT; } blk_pm_runtime_init(sdp->request_queue, dev); if (sdp->rpm_autosuspend) { pm_runtime_set_autosuspend_delay(dev, sdp->host->rpm_autosuspend_delay); } error = device_add_disk(dev, gd, NULL); if (error) { device_unregister(&sdkp->disk_dev); put_disk(gd); goto out; } if (sdkp->security) { sdkp->opal_dev = init_opal_dev(sdkp, &sd_sec_submit); if (sdkp->opal_dev) sd_printk(KERN_NOTICE, sdkp, "supports TCG Opal\n"); } sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n", sdp->removable ? "removable " : ""); scsi_autopm_put_device(sdp); return 0; out_free_index: ida_free(&sd_index_ida, index); out_put: put_disk(gd); out_free: kfree(sdkp); out: scsi_autopm_put_device(sdp); return error; } /** * sd_remove - called whenever a scsi disk (previously recognized by * sd_probe) is detached from the system. It is called (potentially * multiple times) during sd module unload. * @dev: pointer to device object * * Note: this function is invoked from the scsi mid-level. * This function potentially frees up a device name (e.g. /dev/sdc) * that could be re-used by a subsequent sd_probe(). * This function is not called when the built-in sd driver is "exit-ed". **/ static int sd_remove(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); scsi_autopm_get_device(sdkp->device); device_del(&sdkp->disk_dev); del_gendisk(sdkp->disk); if (!sdkp->suspended) sd_shutdown(dev); put_disk(sdkp->disk); return 0; } static void scsi_disk_release(struct device *dev) { struct scsi_disk *sdkp = to_scsi_disk(dev); ida_free(&sd_index_ida, sdkp->index); put_device(&sdkp->device->sdev_gendev); free_opal_dev(sdkp->opal_dev); kfree(sdkp); } static int sd_start_stop_device(struct scsi_disk *sdkp, int start) { unsigned char cmd[6] = { START_STOP }; /* START_VALID */ struct scsi_sense_hdr sshdr; struct scsi_failure failure_defs[] = { { /* Power on, reset, or bus device reset occurred */ .sense = UNIT_ATTENTION, .asc = 0x29, .ascq = 0, .result = SAM_STAT_CHECK_CONDITION, }, { /* Power on occurred */ .sense = UNIT_ATTENTION, .asc = 0x29, .ascq = 1, .result = SAM_STAT_CHECK_CONDITION, }, { /* SCSI bus reset */ .sense = UNIT_ATTENTION, .asc = 0x29, .ascq = 2, .result = SAM_STAT_CHECK_CONDITION, }, {} }; struct scsi_failures failures = { .total_allowed = 3, .failure_definitions = failure_defs, }; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, .req_flags = BLK_MQ_REQ_PM, .failures = &failures, }; struct scsi_device *sdp = sdkp->device; int res; if (start) cmd[4] |= 1; /* START */ if (sdp->start_stop_pwr_cond) cmd[4] |= start ? 1 << 4 : 3 << 4; /* Active or Standby */ if (!scsi_device_online(sdp)) return -ENODEV; res = scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, NULL, 0, SD_TIMEOUT, sdkp->max_retries, &exec_args); if (res) { sd_print_result(sdkp, "Start/Stop Unit failed", res); if (res > 0 && scsi_sense_valid(&sshdr)) { sd_print_sense_hdr(sdkp, &sshdr); /* 0x3a is medium not present */ if (sshdr.asc == 0x3a) res = 0; } } /* SCSI error codes must not go to the generic layer */ if (res) return -EIO; return 0; } /* * Send a SYNCHRONIZE CACHE instruction down to the device through * the normal SCSI command structure. Wait for the command to * complete. */ static void sd_shutdown(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); if (!sdkp) return; /* this can happen */ if (pm_runtime_suspended(dev)) return; if (sdkp->WCE && sdkp->media_present) { sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); sd_sync_cache(sdkp); } if ((system_state != SYSTEM_RESTART && sdkp->device->manage_system_start_stop) || (system_state == SYSTEM_POWER_OFF && sdkp->device->manage_shutdown)) { sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n"); sd_start_stop_device(sdkp, 0); } } static inline bool sd_do_start_stop(struct scsi_device *sdev, bool runtime) { return (sdev->manage_system_start_stop && !runtime) || (sdev->manage_runtime_start_stop && runtime); } static int sd_suspend_common(struct device *dev, bool runtime) { struct scsi_disk *sdkp = dev_get_drvdata(dev); int ret = 0; if (!sdkp) /* E.g.: runtime suspend following sd_remove() */ return 0; if (sdkp->WCE && sdkp->media_present) { if (!sdkp->device->silence_suspend) sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); ret = sd_sync_cache(sdkp); /* ignore OFFLINE device */ if (ret == -ENODEV) return 0; if (ret) return ret; } if (sd_do_start_stop(sdkp->device, runtime)) { if (!sdkp->device->silence_suspend) sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n"); /* an error is not worth aborting a system sleep */ ret = sd_start_stop_device(sdkp, 0); if (!runtime) ret = 0; } if (!ret) sdkp->suspended = true; return ret; } static int sd_suspend_system(struct device *dev) { if (pm_runtime_suspended(dev)) return 0; return sd_suspend_common(dev, false); } static int sd_suspend_runtime(struct device *dev) { return sd_suspend_common(dev, true); } static int sd_resume(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); if (opal_unlock_from_suspend(sdkp->opal_dev)) { sd_printk(KERN_NOTICE, sdkp, "OPAL unlock failed\n"); return -EIO; } return 0; } static int sd_resume_common(struct device *dev, bool runtime) { struct scsi_disk *sdkp = dev_get_drvdata(dev); int ret; if (!sdkp) /* E.g.: runtime resume at the start of sd_probe() */ return 0; if (!sd_do_start_stop(sdkp->device, runtime)) { sdkp->suspended = false; return 0; } sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); ret = sd_start_stop_device(sdkp, 1); if (!ret) { sd_resume(dev); sdkp->suspended = false; } return ret; } static int sd_resume_system(struct device *dev) { if (pm_runtime_suspended(dev)) { struct scsi_disk *sdkp = dev_get_drvdata(dev); struct scsi_device *sdp = sdkp ? sdkp->device : NULL; if (sdp && sdp->force_runtime_start_on_system_start) pm_request_resume(dev); return 0; } return sd_resume_common(dev, false); } static int sd_resume_runtime(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); struct scsi_device *sdp; if (!sdkp) /* E.g.: runtime resume at the start of sd_probe() */ return 0; sdp = sdkp->device; if (sdp->ignore_media_change) { /* clear the device's sense data */ static const u8 cmd[10] = { REQUEST_SENSE }; const struct scsi_exec_args exec_args = { .req_flags = BLK_MQ_REQ_PM, }; if (scsi_execute_cmd(sdp, cmd, REQ_OP_DRV_IN, NULL, 0, sdp->request_queue->rq_timeout, 1, &exec_args)) sd_printk(KERN_NOTICE, sdkp, "Failed to clear sense data\n"); } return sd_resume_common(dev, true); } static const struct dev_pm_ops sd_pm_ops = { .suspend = sd_suspend_system, .resume = sd_resume_system, .poweroff = sd_suspend_system, .restore = sd_resume_system, .runtime_suspend = sd_suspend_runtime, .runtime_resume = sd_resume_runtime, }; static struct scsi_driver sd_template = { .gendrv = { .name = "sd", .probe = sd_probe, .probe_type = PROBE_PREFER_ASYNCHRONOUS, .remove = sd_remove, .shutdown = sd_shutdown, .pm = &sd_pm_ops, }, .rescan = sd_rescan, .resume = sd_resume, .init_command = sd_init_command, .uninit_command = sd_uninit_command, .done = sd_done, .eh_action = sd_eh_action, .eh_reset = sd_eh_reset, }; /** * init_sd - entry point for this driver (both when built in or when * a module). * * Note: this function registers this driver with the scsi mid-level. **/ static int __init init_sd(void) { int majors = 0, i, err; SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n")); for (i = 0; i < SD_MAJORS; i++) { if (__register_blkdev(sd_major(i), "sd", sd_default_probe)) continue; majors++; } if (!majors) return -ENODEV; err = class_register(&sd_disk_class); if (err) goto err_out; sd_page_pool = mempool_create_page_pool(SD_MEMPOOL_SIZE, 0); if (!sd_page_pool) { printk(KERN_ERR "sd: can't init discard page pool\n"); err = -ENOMEM; goto err_out_class; } err = scsi_register_driver(&sd_template.gendrv); if (err) goto err_out_driver; return 0; err_out_driver: mempool_destroy(sd_page_pool); err_out_class: class_unregister(&sd_disk_class); err_out: for (i = 0; i < SD_MAJORS; i++) unregister_blkdev(sd_major(i), "sd"); return err; } /** * exit_sd - exit point for this driver (when it is a module). * * Note: this function unregisters this driver from the scsi mid-level. **/ static void __exit exit_sd(void) { int i; SCSI_LOG_HLQUEUE(3, printk("exit_sd: exiting sd driver\n")); scsi_unregister_driver(&sd_template.gendrv); mempool_destroy(sd_page_pool); class_unregister(&sd_disk_class); for (i = 0; i < SD_MAJORS; i++) unregister_blkdev(sd_major(i), "sd"); } module_init(init_sd); module_exit(exit_sd); void sd_print_sense_hdr(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) { scsi_print_sense_hdr(sdkp->device, sdkp->disk ? sdkp->disk->disk_name : NULL, sshdr); } void sd_print_result(const struct scsi_disk *sdkp, const char *msg, int result) { const char *hb_string = scsi_hostbyte_string(result); if (hb_string) sd_printk(KERN_INFO, sdkp, "%s: Result: hostbyte=%s driverbyte=%s\n", msg, hb_string ? hb_string : "invalid", "DRIVER_OK"); else sd_printk(KERN_INFO, sdkp, "%s: Result: hostbyte=0x%02x driverbyte=%s\n", msg, host_byte(result), "DRIVER_OK"); } |
4 4 4 4 4 4 4 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 | /* * Copyright (c) 2014 Chelsio, Inc. All rights reserved. * Copyright (c) 2014 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "iwpm_util.h" #define IWPM_MAPINFO_HASH_SIZE 512 #define IWPM_MAPINFO_HASH_MASK (IWPM_MAPINFO_HASH_SIZE - 1) #define IWPM_REMINFO_HASH_SIZE 64 #define IWPM_REMINFO_HASH_MASK (IWPM_REMINFO_HASH_SIZE - 1) #define IWPM_MSG_SIZE 512 static LIST_HEAD(iwpm_nlmsg_req_list); static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock); static struct hlist_head *iwpm_hash_bucket; static DEFINE_SPINLOCK(iwpm_mapinfo_lock); static struct hlist_head *iwpm_reminfo_bucket; static DEFINE_SPINLOCK(iwpm_reminfo_lock); static struct iwpm_admin_data iwpm_admin; /** * iwpm_init - Allocate resources for the iwarp port mapper * @nl_client: The index of the netlink client * * Should be called when network interface goes up. */ int iwpm_init(u8 nl_client) { iwpm_hash_bucket = kcalloc(IWPM_MAPINFO_HASH_SIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!iwpm_hash_bucket) return -ENOMEM; iwpm_reminfo_bucket = kcalloc(IWPM_REMINFO_HASH_SIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!iwpm_reminfo_bucket) { kfree(iwpm_hash_bucket); return -ENOMEM; } iwpm_set_registration(nl_client, IWPM_REG_UNDEF); pr_debug("%s: Mapinfo and reminfo tables are created\n", __func__); return 0; } static void free_hash_bucket(void); static void free_reminfo_bucket(void); /** * iwpm_exit - Deallocate resources for the iwarp port mapper * @nl_client: The index of the netlink client * * Should be called when network interface goes down. */ int iwpm_exit(u8 nl_client) { free_hash_bucket(); free_reminfo_bucket(); pr_debug("%s: Resources are destroyed\n", __func__); iwpm_set_registration(nl_client, IWPM_REG_UNDEF); return 0; } static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *, struct sockaddr_storage *); /** * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address * info in a hash table * @local_sockaddr: Local ip/tcp address * @mapped_sockaddr: Mapped local ip/tcp address * @nl_client: The index of the netlink client * @map_flags: IWPM mapping flags */ int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_sockaddr, u8 nl_client, u32 map_flags) { struct hlist_head *hash_bucket_head = NULL; struct iwpm_mapping_info *map_info; unsigned long flags; int ret = -EINVAL; map_info = kzalloc(sizeof(struct iwpm_mapping_info), GFP_KERNEL); if (!map_info) return -ENOMEM; memcpy(&map_info->local_sockaddr, local_sockaddr, sizeof(struct sockaddr_storage)); memcpy(&map_info->mapped_sockaddr, mapped_sockaddr, sizeof(struct sockaddr_storage)); map_info->nl_client = nl_client; map_info->map_flags = map_flags; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); if (iwpm_hash_bucket) { hash_bucket_head = get_mapinfo_hash_bucket( &map_info->local_sockaddr, &map_info->mapped_sockaddr); if (hash_bucket_head) { hlist_add_head(&map_info->hlist_node, hash_bucket_head); ret = 0; } } spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); if (!hash_bucket_head) kfree(map_info); return ret; } /** * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address * info from the hash table * @local_sockaddr: Local ip/tcp address * @mapped_local_addr: Mapped local ip/tcp address * * Returns err code if mapping info is not found in the hash table, * otherwise returns 0 */ int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_local_addr) { struct hlist_node *tmp_hlist_node; struct hlist_head *hash_bucket_head; struct iwpm_mapping_info *map_info = NULL; unsigned long flags; int ret = -EINVAL; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); if (iwpm_hash_bucket) { hash_bucket_head = get_mapinfo_hash_bucket( local_sockaddr, mapped_local_addr); if (!hash_bucket_head) goto remove_mapinfo_exit; hlist_for_each_entry_safe(map_info, tmp_hlist_node, hash_bucket_head, hlist_node) { if (!iwpm_compare_sockaddr(&map_info->mapped_sockaddr, mapped_local_addr)) { hlist_del_init(&map_info->hlist_node); kfree(map_info); ret = 0; break; } } } remove_mapinfo_exit: spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return ret; } static void free_hash_bucket(void) { struct hlist_node *tmp_hlist_node; struct iwpm_mapping_info *map_info; unsigned long flags; int i; /* remove all the mapinfo data from the list */ spin_lock_irqsave(&iwpm_mapinfo_lock, flags); for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { hlist_for_each_entry_safe(map_info, tmp_hlist_node, &iwpm_hash_bucket[i], hlist_node) { hlist_del_init(&map_info->hlist_node); kfree(map_info); } } /* free the hash list */ kfree(iwpm_hash_bucket); iwpm_hash_bucket = NULL; spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); } static void free_reminfo_bucket(void) { struct hlist_node *tmp_hlist_node; struct iwpm_remote_info *rem_info; unsigned long flags; int i; /* remove all the remote info from the list */ spin_lock_irqsave(&iwpm_reminfo_lock, flags); for (i = 0; i < IWPM_REMINFO_HASH_SIZE; i++) { hlist_for_each_entry_safe(rem_info, tmp_hlist_node, &iwpm_reminfo_bucket[i], hlist_node) { hlist_del_init(&rem_info->hlist_node); kfree(rem_info); } } /* free the hash list */ kfree(iwpm_reminfo_bucket); iwpm_reminfo_bucket = NULL; spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); } static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage *, struct sockaddr_storage *); void iwpm_add_remote_info(struct iwpm_remote_info *rem_info) { struct hlist_head *hash_bucket_head; unsigned long flags; spin_lock_irqsave(&iwpm_reminfo_lock, flags); if (iwpm_reminfo_bucket) { hash_bucket_head = get_reminfo_hash_bucket( &rem_info->mapped_loc_sockaddr, &rem_info->mapped_rem_sockaddr); if (hash_bucket_head) hlist_add_head(&rem_info->hlist_node, hash_bucket_head); } spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); } /** * iwpm_get_remote_info - Get the remote connecting peer address info * * @mapped_loc_addr: Mapped local address of the listening peer * @mapped_rem_addr: Mapped remote address of the connecting peer * @remote_addr: To store the remote address of the connecting peer * @nl_client: The index of the netlink client * * The remote address info is retrieved and provided to the client in * the remote_addr. After that it is removed from the hash table */ int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, struct sockaddr_storage *mapped_rem_addr, struct sockaddr_storage *remote_addr, u8 nl_client) { struct hlist_node *tmp_hlist_node; struct hlist_head *hash_bucket_head; struct iwpm_remote_info *rem_info = NULL; unsigned long flags; int ret = -EINVAL; spin_lock_irqsave(&iwpm_reminfo_lock, flags); if (iwpm_reminfo_bucket) { hash_bucket_head = get_reminfo_hash_bucket( mapped_loc_addr, mapped_rem_addr); if (!hash_bucket_head) goto get_remote_info_exit; hlist_for_each_entry_safe(rem_info, tmp_hlist_node, hash_bucket_head, hlist_node) { if (!iwpm_compare_sockaddr(&rem_info->mapped_loc_sockaddr, mapped_loc_addr) && !iwpm_compare_sockaddr(&rem_info->mapped_rem_sockaddr, mapped_rem_addr)) { memcpy(remote_addr, &rem_info->remote_sockaddr, sizeof(struct sockaddr_storage)); iwpm_print_sockaddr(remote_addr, "get_remote_info: Remote sockaddr:"); hlist_del_init(&rem_info->hlist_node); kfree(rem_info); ret = 0; break; } } } get_remote_info_exit: spin_unlock_irqrestore(&iwpm_reminfo_lock, flags); return ret; } struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq, u8 nl_client, gfp_t gfp) { struct iwpm_nlmsg_request *nlmsg_request; unsigned long flags; nlmsg_request = kzalloc(sizeof(struct iwpm_nlmsg_request), gfp); if (!nlmsg_request) return NULL; spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); list_add_tail(&nlmsg_request->inprocess_list, &iwpm_nlmsg_req_list); spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); kref_init(&nlmsg_request->kref); kref_get(&nlmsg_request->kref); nlmsg_request->nlmsg_seq = nlmsg_seq; nlmsg_request->nl_client = nl_client; nlmsg_request->request_done = 0; nlmsg_request->err_code = 0; sema_init(&nlmsg_request->sem, 1); down(&nlmsg_request->sem); return nlmsg_request; } void iwpm_free_nlmsg_request(struct kref *kref) { struct iwpm_nlmsg_request *nlmsg_request; unsigned long flags; nlmsg_request = container_of(kref, struct iwpm_nlmsg_request, kref); spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); list_del_init(&nlmsg_request->inprocess_list); spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); if (!nlmsg_request->request_done) pr_debug("%s Freeing incomplete nlmsg request (seq = %u).\n", __func__, nlmsg_request->nlmsg_seq); kfree(nlmsg_request); } struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq) { struct iwpm_nlmsg_request *nlmsg_request; struct iwpm_nlmsg_request *found_request = NULL; unsigned long flags; spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags); list_for_each_entry(nlmsg_request, &iwpm_nlmsg_req_list, inprocess_list) { if (nlmsg_request->nlmsg_seq == echo_seq) { found_request = nlmsg_request; kref_get(&nlmsg_request->kref); break; } } spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags); return found_request; } int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request) { int ret; ret = down_timeout(&nlmsg_request->sem, IWPM_NL_TIMEOUT); if (ret) { ret = -EINVAL; pr_info("%s: Timeout %d sec for netlink request (seq = %u)\n", __func__, (IWPM_NL_TIMEOUT/HZ), nlmsg_request->nlmsg_seq); } else { ret = nlmsg_request->err_code; } kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request); return ret; } int iwpm_get_nlmsg_seq(void) { return atomic_inc_return(&iwpm_admin.nlmsg_seq); } /* valid client */ u32 iwpm_get_registration(u8 nl_client) { return iwpm_admin.reg_list[nl_client]; } /* valid client */ void iwpm_set_registration(u8 nl_client, u32 reg) { iwpm_admin.reg_list[nl_client] = reg; } /* valid client */ u32 iwpm_check_registration(u8 nl_client, u32 reg) { return (iwpm_get_registration(nl_client) & reg); } int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr, struct sockaddr_storage *b_sockaddr) { if (a_sockaddr->ss_family != b_sockaddr->ss_family) return 1; if (a_sockaddr->ss_family == AF_INET) { struct sockaddr_in *a4_sockaddr = (struct sockaddr_in *)a_sockaddr; struct sockaddr_in *b4_sockaddr = (struct sockaddr_in *)b_sockaddr; if (!memcmp(&a4_sockaddr->sin_addr, &b4_sockaddr->sin_addr, sizeof(struct in_addr)) && a4_sockaddr->sin_port == b4_sockaddr->sin_port) return 0; } else if (a_sockaddr->ss_family == AF_INET6) { struct sockaddr_in6 *a6_sockaddr = (struct sockaddr_in6 *)a_sockaddr; struct sockaddr_in6 *b6_sockaddr = (struct sockaddr_in6 *)b_sockaddr; if (!memcmp(&a6_sockaddr->sin6_addr, &b6_sockaddr->sin6_addr, sizeof(struct in6_addr)) && a6_sockaddr->sin6_port == b6_sockaddr->sin6_port) return 0; } else { pr_err("%s: Invalid sockaddr family\n", __func__); } return 1; } struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh, int nl_client) { struct sk_buff *skb = NULL; skb = dev_alloc_skb(IWPM_MSG_SIZE); if (!skb) goto create_nlmsg_exit; if (!(ibnl_put_msg(skb, nlh, 0, 0, nl_client, nl_op, NLM_F_REQUEST))) { pr_warn("%s: Unable to put the nlmsg header\n", __func__); dev_kfree_skb(skb); skb = NULL; } create_nlmsg_exit: return skb; } int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max, const struct nla_policy *nlmsg_policy, struct nlattr *nltb[], const char *msg_type) { int nlh_len = 0; int ret; const char *err_str = ""; ret = nlmsg_validate_deprecated(cb->nlh, nlh_len, policy_max - 1, nlmsg_policy, NULL); if (ret) { err_str = "Invalid attribute"; goto parse_nlmsg_error; } ret = nlmsg_parse_deprecated(cb->nlh, nlh_len, nltb, policy_max - 1, nlmsg_policy, NULL); if (ret) { err_str = "Unable to parse the nlmsg"; goto parse_nlmsg_error; } ret = iwpm_validate_nlmsg_attr(nltb, policy_max); if (ret) { err_str = "Invalid NULL attribute"; goto parse_nlmsg_error; } return 0; parse_nlmsg_error: pr_warn("%s: %s (msg type %s ret = %d)\n", __func__, err_str, msg_type, ret); return ret; } void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg) { struct sockaddr_in6 *sockaddr_v6; struct sockaddr_in *sockaddr_v4; switch (sockaddr->ss_family) { case AF_INET: sockaddr_v4 = (struct sockaddr_in *)sockaddr; pr_debug("%s IPV4 %pI4: %u(0x%04X)\n", msg, &sockaddr_v4->sin_addr, ntohs(sockaddr_v4->sin_port), ntohs(sockaddr_v4->sin_port)); break; case AF_INET6: sockaddr_v6 = (struct sockaddr_in6 *)sockaddr; pr_debug("%s IPV6 %pI6: %u(0x%04X)\n", msg, &sockaddr_v6->sin6_addr, ntohs(sockaddr_v6->sin6_port), ntohs(sockaddr_v6->sin6_port)); break; default: break; } } static u32 iwpm_ipv6_jhash(struct sockaddr_in6 *ipv6_sockaddr) { u32 ipv6_hash = jhash(&ipv6_sockaddr->sin6_addr, sizeof(struct in6_addr), 0); u32 hash = jhash_2words(ipv6_hash, (__force u32) ipv6_sockaddr->sin6_port, 0); return hash; } static u32 iwpm_ipv4_jhash(struct sockaddr_in *ipv4_sockaddr) { u32 ipv4_hash = jhash(&ipv4_sockaddr->sin_addr, sizeof(struct in_addr), 0); u32 hash = jhash_2words(ipv4_hash, (__force u32) ipv4_sockaddr->sin_port, 0); return hash; } static int get_hash_bucket(struct sockaddr_storage *a_sockaddr, struct sockaddr_storage *b_sockaddr, u32 *hash) { u32 a_hash, b_hash; if (a_sockaddr->ss_family == AF_INET) { a_hash = iwpm_ipv4_jhash((struct sockaddr_in *) a_sockaddr); b_hash = iwpm_ipv4_jhash((struct sockaddr_in *) b_sockaddr); } else if (a_sockaddr->ss_family == AF_INET6) { a_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) a_sockaddr); b_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) b_sockaddr); } else { pr_err("%s: Invalid sockaddr family\n", __func__); return -EINVAL; } if (a_hash == b_hash) /* if port mapper isn't available */ *hash = a_hash; else *hash = jhash_2words(a_hash, b_hash, 0); return 0; } static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *local_sockaddr, struct sockaddr_storage *mapped_sockaddr) { u32 hash; int ret; ret = get_hash_bucket(local_sockaddr, mapped_sockaddr, &hash); if (ret) return NULL; return &iwpm_hash_bucket[hash & IWPM_MAPINFO_HASH_MASK]; } static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage *mapped_loc_sockaddr, struct sockaddr_storage *mapped_rem_sockaddr) { u32 hash; int ret; ret = get_hash_bucket(mapped_loc_sockaddr, mapped_rem_sockaddr, &hash); if (ret) return NULL; return &iwpm_reminfo_bucket[hash & IWPM_REMINFO_HASH_MASK]; } static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) { struct sk_buff *skb = NULL; struct nlmsghdr *nlh; u32 msg_seq; const char *err_str = ""; int ret = -EINVAL; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_MAPINFO_NUM, &nlh, nl_client); if (!skb) { err_str = "Unable to create a nlmsg"; goto mapinfo_num_error; } nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); msg_seq = 0; err_str = "Unable to put attribute of mapinfo number nlmsg"; ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_MAPINFO_SEQ); if (ret) goto mapinfo_num_error; ret = ibnl_put_attr(skb, nlh, sizeof(u32), &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM); if (ret) goto mapinfo_num_error; nlmsg_end(skb, nlh); ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) { skb = NULL; err_str = "Unable to send a nlmsg"; goto mapinfo_num_error; } pr_debug("%s: Sent mapping number = %u\n", __func__, mapping_num); return 0; mapinfo_num_error: pr_info("%s: %s\n", __func__, err_str); dev_kfree_skb(skb); return ret; } static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) { struct nlmsghdr *nlh = NULL; int ret = 0; if (!skb) return ret; if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { pr_warn("%s Unable to put NLMSG_DONE\n", __func__); dev_kfree_skb(skb); return -ENOMEM; } nlh->nlmsg_type = NLMSG_DONE; ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) pr_warn("%s Unable to send a nlmsg\n", __func__); return ret; } int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) { struct iwpm_mapping_info *map_info; struct sk_buff *skb = NULL; struct nlmsghdr *nlh; int skb_num = 0, mapping_num = 0; int i = 0, nlmsg_bytes = 0; unsigned long flags; const char *err_str = ""; int ret; skb = dev_alloc_skb(NLMSG_GOODSIZE); if (!skb) { ret = -ENOMEM; err_str = "Unable to allocate skb"; goto send_mapping_info_exit; } skb_num++; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); ret = -EINVAL; for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { hlist_for_each_entry(map_info, &iwpm_hash_bucket[i], hlist_node) { if (map_info->nl_client != nl_client) continue; nlh = NULL; if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { ret = -ENOMEM; err_str = "Unable to put the nlmsg header"; goto send_mapping_info_unlock; } err_str = "Unable to put attribute of the nlmsg"; ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), &map_info->local_sockaddr, IWPM_NLA_MAPINFO_LOCAL_ADDR); if (ret) goto send_mapping_info_unlock; ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage), &map_info->mapped_sockaddr, IWPM_NLA_MAPINFO_MAPPED_ADDR); if (ret) goto send_mapping_info_unlock; if (iwpm_ulib_version > IWPM_UABI_VERSION_MIN) { ret = ibnl_put_attr(skb, nlh, sizeof(u32), &map_info->map_flags, IWPM_NLA_MAPINFO_FLAGS); if (ret) goto send_mapping_info_unlock; } nlmsg_end(skb, nlh); iwpm_print_sockaddr(&map_info->local_sockaddr, "send_mapping_info: Local sockaddr:"); iwpm_print_sockaddr(&map_info->mapped_sockaddr, "send_mapping_info: Mapped local sockaddr:"); mapping_num++; nlmsg_bytes += nlh->nlmsg_len; /* check if all mappings can fit in one skb */ if (NLMSG_GOODSIZE - nlmsg_bytes < nlh->nlmsg_len * 2) { /* and leave room for NLMSG_DONE */ nlmsg_bytes = 0; skb_num++; spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); /* send the skb */ ret = send_nlmsg_done(skb, nl_client, iwpm_pid); skb = NULL; if (ret) { err_str = "Unable to send map info"; goto send_mapping_info_exit; } if (skb_num == IWPM_MAPINFO_SKB_COUNT) { ret = -ENOMEM; err_str = "Insufficient skbs for map info"; goto send_mapping_info_exit; } skb = dev_alloc_skb(NLMSG_GOODSIZE); if (!skb) { ret = -ENOMEM; err_str = "Unable to allocate skb"; goto send_mapping_info_exit; } spin_lock_irqsave(&iwpm_mapinfo_lock, flags); } } } send_mapping_info_unlock: spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); send_mapping_info_exit: if (ret) { pr_warn("%s: %s (ret = %d)\n", __func__, err_str, ret); dev_kfree_skb(skb); return ret; } send_nlmsg_done(skb, nl_client, iwpm_pid); return send_mapinfo_num(mapping_num, nl_client, iwpm_pid); } int iwpm_mapinfo_available(void) { unsigned long flags; int full_bucket = 0, i = 0; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); if (iwpm_hash_bucket) { for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { if (!hlist_empty(&iwpm_hash_bucket[i])) { full_bucket = 1; break; } } } spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags); return full_bucket; } int iwpm_send_hello(u8 nl_client, int iwpm_pid, u16 abi_version) { struct sk_buff *skb = NULL; struct nlmsghdr *nlh; const char *err_str; int ret = -EINVAL; skb = iwpm_create_nlmsg(RDMA_NL_IWPM_HELLO, &nlh, nl_client); if (!skb) { err_str = "Unable to create a nlmsg"; goto hello_num_error; } nlh->nlmsg_seq = iwpm_get_nlmsg_seq(); err_str = "Unable to put attribute of abi_version into nlmsg"; ret = ibnl_put_attr(skb, nlh, sizeof(u16), &abi_version, IWPM_NLA_HELLO_ABI_VERSION); if (ret) goto hello_num_error; nlmsg_end(skb, nlh); ret = rdma_nl_unicast(&init_net, skb, iwpm_pid); if (ret) { skb = NULL; err_str = "Unable to send a nlmsg"; goto hello_num_error; } pr_debug("%s: Sent hello abi_version = %u\n", __func__, abi_version); return 0; hello_num_error: pr_info("%s: %s\n", __func__, err_str); dev_kfree_skb(skb); return ret; } |
7 3 3 3 3 3 3 3 3 3 3 3 3 6 1 1 4 1 3 2 1 1 3 3 3 3 3 3 3 3 5 6 13 3 1 1 1 1 1 1 1 3 3 3 3 2 2 2 2 2 2 2 6 6 6 3 3 3 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 | // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/sch_qfq.c Quick Fair Queueing Plus Scheduler. * * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente. * Copyright (c) 2012 Paolo Valente. */ #include <linux/module.h> #include <linux/init.h> #include <linux/bitops.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/pkt_sched.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> /* Quick Fair Queueing Plus ======================== Sources: [1] Paolo Valente, "Reducing the Execution Time of Fair-Queueing Schedulers." http://algo.ing.unimo.it/people/paolo/agg-sched/agg-sched.pdf Sources for QFQ: [2] Fabio Checconi, Luigi Rizzo, and Paolo Valente: "QFQ: Efficient Packet Scheduling with Tight Bandwidth Distribution Guarantees." See also: http://retis.sssup.it/~fabio/linux/qfq/ */ /* QFQ+ divides classes into aggregates of at most MAX_AGG_CLASSES classes. Each aggregate is timestamped with a virtual start time S and a virtual finish time F, and scheduled according to its timestamps. S and F are computed as a function of a system virtual time function V. The classes within each aggregate are instead scheduled with DRR. To speed up operations, QFQ+ divides also aggregates into a limited number of groups. Which group a class belongs to depends on the ratio between the maximum packet length for the class and the weight of the class. Groups have their own S and F. In the end, QFQ+ schedules groups, then aggregates within groups, then classes within aggregates. See [1] and [2] for a full description. Virtual time computations. S, F and V are all computed in fixed point arithmetic with FRAC_BITS decimal bits. QFQ_MAX_INDEX is the maximum index allowed for a group. We need one bit per index. QFQ_MAX_WSHIFT is the maximum power of two supported as a weight. The layout of the bits is as below: [ MTU_SHIFT ][ FRAC_BITS ] [ MAX_INDEX ][ MIN_SLOT_SHIFT ] ^.__grp->index = 0 *.__grp->slot_shift where MIN_SLOT_SHIFT is derived by difference from the others. The max group index corresponds to Lmax/w_min, where Lmax=1<<MTU_SHIFT, w_min = 1 . From this, and knowing how many groups (MAX_INDEX) we want, we can derive the shift corresponding to each group. Because we often need to compute F = S + len/w_i and V = V + len/wsum instead of storing w_i store the value inv_w = (1<<FRAC_BITS)/w_i so we can do F = S + len * inv_w * wsum. We use W_TOT in the formulas so we can easily move between static and adaptive weight sum. The per-scheduler-instance data contain all the data structures for the scheduler: bitmaps and bucket lists. */ /* * Maximum number of consecutive slots occupied by backlogged classes * inside a group. */ #define QFQ_MAX_SLOTS 32 /* * Shifts used for aggregate<->group mapping. We allow class weights that are * in the range [1, 2^MAX_WSHIFT], and we try to map each aggregate i to the * group with the smallest index that can support the L_i / r_i configured * for the classes in the aggregate. * * grp->index is the index of the group; and grp->slot_shift * is the shift for the corresponding (scaled) sigma_i. */ #define QFQ_MAX_INDEX 24 #define QFQ_MAX_WSHIFT 10 #define QFQ_MAX_WEIGHT (1<<QFQ_MAX_WSHIFT) /* see qfq_slot_insert */ #define QFQ_MAX_WSUM (64*QFQ_MAX_WEIGHT) #define FRAC_BITS 30 /* fixed point arithmetic */ #define ONE_FP (1UL << FRAC_BITS) #define QFQ_MTU_SHIFT 16 /* to support TSO/GSO */ #define QFQ_MIN_LMAX 512 /* see qfq_slot_insert */ #define QFQ_MAX_LMAX (1UL << QFQ_MTU_SHIFT) #define QFQ_MAX_AGG_CLASSES 8 /* max num classes per aggregate allowed */ /* * Possible group states. These values are used as indexes for the bitmaps * array of struct qfq_queue. */ enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE }; struct qfq_group; struct qfq_aggregate; struct qfq_class { struct Qdisc_class_common common; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct Qdisc *qdisc; struct list_head alist; /* Link for active-classes list. */ struct qfq_aggregate *agg; /* Parent aggregate. */ int deficit; /* DRR deficit counter. */ }; struct qfq_aggregate { struct hlist_node next; /* Link for the slot list. */ u64 S, F; /* flow timestamps (exact) */ /* group we belong to. In principle we would need the index, * which is log_2(lmax/weight), but we never reference it * directly, only the group. */ struct qfq_group *grp; /* these are copied from the flowset. */ u32 class_weight; /* Weight of each class in this aggregate. */ /* Max pkt size for the classes in this aggregate, DRR quantum. */ int lmax; u32 inv_w; /* ONE_FP/(sum of weights of classes in aggr.). */ u32 budgetmax; /* Max budget for this aggregate. */ u32 initial_budget, budget; /* Initial and current budget. */ int num_classes; /* Number of classes in this aggr. */ struct list_head active; /* DRR queue of active classes. */ struct hlist_node nonfull_next; /* See nonfull_aggs in qfq_sched. */ }; struct qfq_group { u64 S, F; /* group timestamps (approx). */ unsigned int slot_shift; /* Slot shift. */ unsigned int index; /* Group index. */ unsigned int front; /* Index of the front slot. */ unsigned long full_slots; /* non-empty slots */ /* Array of RR lists of active aggregates. */ struct hlist_head slots[QFQ_MAX_SLOTS]; }; struct qfq_sched { struct tcf_proto __rcu *filter_list; struct tcf_block *block; struct Qdisc_class_hash clhash; u64 oldV, V; /* Precise virtual times. */ struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */ u32 wsum; /* weight sum */ u32 iwsum; /* inverse weight sum */ unsigned long bitmaps[QFQ_MAX_STATE]; /* Group bitmaps. */ struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */ u32 min_slot_shift; /* Index of the group-0 bit in the bitmaps. */ u32 max_agg_classes; /* Max number of classes per aggr. */ struct hlist_head nonfull_aggs; /* Aggs with room for more classes. */ }; /* * Possible reasons why the timestamps of an aggregate are updated * enqueue: the aggregate switches from idle to active and must scheduled * for service * requeue: the aggregate finishes its budget, so it stops being served and * must be rescheduled for service */ enum update_reason {enqueue, requeue}; static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) { struct qfq_sched *q = qdisc_priv(sch); struct Qdisc_class_common *clc; clc = qdisc_class_find(&q->clhash, classid); if (clc == NULL) return NULL; return container_of(clc, struct qfq_class, common); } static const struct netlink_range_validation lmax_range = { .min = QFQ_MIN_LMAX, .max = QFQ_MAX_LMAX, }; static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = { [TCA_QFQ_WEIGHT] = NLA_POLICY_RANGE(NLA_U32, 1, QFQ_MAX_WEIGHT), [TCA_QFQ_LMAX] = NLA_POLICY_FULL_RANGE(NLA_U32, &lmax_range), }; /* * Calculate a flow index, given its weight and maximum packet length. * index = log_2(maxlen/weight) but we need to apply the scaling. * This is used only once at flow creation. */ static int qfq_calc_index(u32 inv_w, unsigned int maxlen, u32 min_slot_shift) { u64 slot_size = (u64)maxlen * inv_w; unsigned long size_map; int index = 0; size_map = slot_size >> min_slot_shift; if (!size_map) goto out; index = __fls(size_map) + 1; /* basically a log_2 */ index -= !(slot_size - (1ULL << (index + min_slot_shift - 1))); if (index < 0) index = 0; out: pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n", (unsigned long) ONE_FP/inv_w, maxlen, index); return index; } static void qfq_deactivate_agg(struct qfq_sched *, struct qfq_aggregate *); static void qfq_activate_agg(struct qfq_sched *, struct qfq_aggregate *, enum update_reason); static void qfq_init_agg(struct qfq_sched *q, struct qfq_aggregate *agg, u32 lmax, u32 weight) { INIT_LIST_HEAD(&agg->active); hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); agg->lmax = lmax; agg->class_weight = weight; } static struct qfq_aggregate *qfq_find_agg(struct qfq_sched *q, u32 lmax, u32 weight) { struct qfq_aggregate *agg; hlist_for_each_entry(agg, &q->nonfull_aggs, nonfull_next) if (agg->lmax == lmax && agg->class_weight == weight) return agg; return NULL; } /* Update aggregate as a function of the new number of classes. */ static void qfq_update_agg(struct qfq_sched *q, struct qfq_aggregate *agg, int new_num_classes) { u32 new_agg_weight; if (new_num_classes == q->max_agg_classes) hlist_del_init(&agg->nonfull_next); if (agg->num_classes > new_num_classes && new_num_classes == q->max_agg_classes - 1) /* agg no more full */ hlist_add_head(&agg->nonfull_next, &q->nonfull_aggs); /* The next assignment may let * agg->initial_budget > agg->budgetmax * hold, we will take it into account in charge_actual_service(). */ agg->budgetmax = new_num_classes * agg->lmax; new_agg_weight = agg->class_weight * new_num_classes; agg->inv_w = ONE_FP/new_agg_weight; if (agg->grp == NULL) { int i = qfq_calc_index(agg->inv_w, agg->budgetmax, q->min_slot_shift); agg->grp = &q->groups[i]; } q->wsum += (int) agg->class_weight * (new_num_classes - agg->num_classes); q->iwsum = ONE_FP / q->wsum; agg->num_classes = new_num_classes; } /* Add class to aggregate. */ static void qfq_add_to_agg(struct qfq_sched *q, struct qfq_aggregate *agg, struct qfq_class *cl) { cl->agg = agg; qfq_update_agg(q, agg, agg->num_classes+1); if (cl->qdisc->q.qlen > 0) { /* adding an active class */ list_add_tail(&cl->alist, &agg->active); if (list_first_entry(&agg->active, struct qfq_class, alist) == cl && q->in_serv_agg != agg) /* agg was inactive */ qfq_activate_agg(q, agg, enqueue); /* schedule agg */ } } static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *); static void qfq_destroy_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { hlist_del_init(&agg->nonfull_next); q->wsum -= agg->class_weight; if (q->wsum != 0) q->iwsum = ONE_FP / q->wsum; if (q->in_serv_agg == agg) q->in_serv_agg = qfq_choose_next_agg(q); kfree(agg); } /* Deschedule class from within its parent aggregate. */ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) { struct qfq_aggregate *agg = cl->agg; list_del(&cl->alist); /* remove from RR queue of the aggregate */ if (list_empty(&agg->active)) /* agg is now inactive */ qfq_deactivate_agg(q, agg); } /* Remove class from its parent aggregate. */ static void qfq_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) { struct qfq_aggregate *agg = cl->agg; cl->agg = NULL; if (agg->num_classes == 1) { /* agg being emptied, destroy it */ qfq_destroy_agg(q, agg); return; } qfq_update_agg(q, agg, agg->num_classes-1); } /* Deschedule class and remove it from its parent aggregate. */ static void qfq_deact_rm_from_agg(struct qfq_sched *q, struct qfq_class *cl) { if (cl->qdisc->q.qlen > 0) /* class is active */ qfq_deactivate_class(q, cl); qfq_rm_from_agg(q, cl); } /* Move class to a new aggregate, matching the new class weight and/or lmax */ static int qfq_change_agg(struct Qdisc *sch, struct qfq_class *cl, u32 weight, u32 lmax) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_aggregate *new_agg; /* 'lmax' can range from [QFQ_MIN_LMAX, pktlen + stab overhead] */ if (lmax > QFQ_MAX_LMAX) return -EINVAL; new_agg = qfq_find_agg(q, lmax, weight); if (new_agg == NULL) { /* create new aggregate */ new_agg = kzalloc(sizeof(*new_agg), GFP_ATOMIC); if (new_agg == NULL) return -ENOBUFS; qfq_init_agg(q, new_agg, lmax, weight); } qfq_deact_rm_from_agg(q, cl); qfq_add_to_agg(q, new_agg, cl); return 0; } static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, unsigned long *arg, struct netlink_ext_ack *extack) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)*arg; bool existing = false; struct nlattr *tb[TCA_QFQ_MAX + 1]; struct qfq_aggregate *new_agg = NULL; u32 weight, lmax, inv_w; int err; int delta_w; if (NL_REQ_ATTR_CHECK(extack, NULL, tca, TCA_OPTIONS)) { NL_SET_ERR_MSG_MOD(extack, "missing options"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy, extack); if (err < 0) return err; if (tb[TCA_QFQ_WEIGHT]) weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]); else weight = 1; if (tb[TCA_QFQ_LMAX]) { lmax = nla_get_u32(tb[TCA_QFQ_LMAX]); } else { /* MTU size is user controlled */ lmax = psched_mtu(qdisc_dev(sch)); if (lmax < QFQ_MIN_LMAX || lmax > QFQ_MAX_LMAX) { NL_SET_ERR_MSG_MOD(extack, "MTU size out of bounds for qfq"); return -EINVAL; } } inv_w = ONE_FP / weight; weight = ONE_FP / inv_w; if (cl != NULL && lmax == cl->agg->lmax && weight == cl->agg->class_weight) return 0; /* nothing to change */ delta_w = weight - (cl ? cl->agg->class_weight : 0); if (q->wsum + delta_w > QFQ_MAX_WSUM) { NL_SET_ERR_MSG_FMT_MOD(extack, "total weight out of range (%d + %u)\n", delta_w, q->wsum); return -EINVAL; } if (cl != NULL) { /* modify existing class */ if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, true, tca[TCA_RATE]); if (err) return err; } existing = true; goto set_change_agg; } /* create and init new class */ cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL); if (cl == NULL) return -ENOBUFS; gnet_stats_basic_sync_init(&cl->bstats); cl->common.classid = classid; cl->deficit = lmax; cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid, NULL); if (cl->qdisc == NULL) cl->qdisc = &noop_qdisc; if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, true, tca[TCA_RATE]); if (err) goto destroy_class; } if (cl->qdisc != &noop_qdisc) qdisc_hash_add(cl->qdisc, true); set_change_agg: sch_tree_lock(sch); new_agg = qfq_find_agg(q, lmax, weight); if (new_agg == NULL) { /* create new aggregate */ sch_tree_unlock(sch); new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL); if (new_agg == NULL) { err = -ENOBUFS; gen_kill_estimator(&cl->rate_est); goto destroy_class; } sch_tree_lock(sch); qfq_init_agg(q, new_agg, lmax, weight); } if (existing) qfq_deact_rm_from_agg(q, cl); else qdisc_class_hash_insert(&q->clhash, &cl->common); qfq_add_to_agg(q, new_agg, cl); sch_tree_unlock(sch); qdisc_class_hash_grow(sch, &q->clhash); *arg = (unsigned long)cl; return 0; destroy_class: qdisc_put(cl->qdisc); kfree(cl); return err; } static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl) { struct qfq_sched *q = qdisc_priv(sch); qfq_rm_from_agg(q, cl); gen_kill_estimator(&cl->rate_est); qdisc_put(cl->qdisc); kfree(cl); } static int qfq_delete_class(struct Qdisc *sch, unsigned long arg, struct netlink_ext_ack *extack) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)arg; if (qdisc_class_in_use(&cl->common)) { NL_SET_ERR_MSG_MOD(extack, "QFQ class in use"); return -EBUSY; } sch_tree_lock(sch); qdisc_purge_queue(cl->qdisc); qdisc_class_hash_remove(&q->clhash, &cl->common); sch_tree_unlock(sch); qfq_destroy_class(sch, cl); return 0; } static unsigned long qfq_search_class(struct Qdisc *sch, u32 classid) { return (unsigned long)qfq_find_class(sch, classid); } static struct tcf_block *qfq_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct qfq_sched *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid) { struct qfq_class *cl = qfq_find_class(sch, classid); if (cl) qdisc_class_get(&cl->common); return (unsigned long)cl; } static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg) { struct qfq_class *cl = (struct qfq_class *)arg; qdisc_class_put(&cl->common); } static int qfq_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct qfq_class *cl = (struct qfq_class *)arg; if (new == NULL) { new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, cl->common.classid, NULL); if (new == NULL) new = &noop_qdisc; } *old = qdisc_replace(sch, new, &cl->qdisc); return 0; } static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg) { struct qfq_class *cl = (struct qfq_class *)arg; return cl->qdisc; } static int qfq_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { struct qfq_class *cl = (struct qfq_class *)arg; struct nlattr *nest; tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle = cl->common.classid; tcm->tcm_info = cl->qdisc->handle; nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (nla_put_u32(skb, TCA_QFQ_WEIGHT, cl->agg->class_weight) || nla_put_u32(skb, TCA_QFQ_LMAX, cl->agg->lmax)) goto nla_put_failure; return nla_nest_end(skb, nest); nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct qfq_class *cl = (struct qfq_class *)arg; struct tc_qfq_stats xstats; memset(&xstats, 0, sizeof(xstats)); xstats.weight = cl->agg->class_weight; xstats.lmax = cl->agg->lmax; if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || qdisc_qstats_copy(d, cl->qdisc) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); } static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; unsigned int i; if (arg->stop) return; for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg)) return; } } } static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; struct tcf_result res; struct tcf_proto *fl; int result; if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) { pr_debug("qfq_classify: found %d\n", skb->priority); cl = qfq_find_class(sch, skb->priority); if (cl != NULL) return cl; } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: case TC_ACT_STOLEN: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return NULL; } #endif cl = (struct qfq_class *)res.class; if (cl == NULL) cl = qfq_find_class(sch, res.classid); return cl; } return NULL; } /* Generic comparison function, handling wraparound. */ static inline int qfq_gt(u64 a, u64 b) { return (s64)(a - b) > 0; } /* Round a precise timestamp to its slotted value. */ static inline u64 qfq_round_down(u64 ts, unsigned int shift) { return ts & ~((1ULL << shift) - 1); } /* return the pointer to the group with lowest index in the bitmap */ static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, unsigned long bitmap) { int index = __ffs(bitmap); return &q->groups[index]; } /* Calculate a mask to mimic what would be ffs_from(). */ static inline unsigned long mask_from(unsigned long bitmap, int from) { return bitmap & ~((1UL << from) - 1); } /* * The state computation relies on ER=0, IR=1, EB=2, IB=3 * First compute eligibility comparing grp->S, q->V, * then check if someone is blocking us and possibly add EB */ static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp) { /* if S > V we are not eligible */ unsigned int state = qfq_gt(grp->S, q->V); unsigned long mask = mask_from(q->bitmaps[ER], grp->index); struct qfq_group *next; if (mask) { next = qfq_ffs(q, mask); if (qfq_gt(grp->F, next->F)) state |= EB; } return state; } /* * In principle * q->bitmaps[dst] |= q->bitmaps[src] & mask; * q->bitmaps[src] &= ~mask; * but we should make sure that src != dst */ static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst) { q->bitmaps[dst] |= q->bitmaps[src] & mask; q->bitmaps[src] &= ~mask; } static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F) { unsigned long mask = mask_from(q->bitmaps[ER], index + 1); struct qfq_group *next; if (mask) { next = qfq_ffs(q, mask); if (!qfq_gt(next->F, old_F)) return; } mask = (1UL << index) - 1; qfq_move_groups(q, mask, EB, ER); qfq_move_groups(q, mask, IB, IR); } /* * perhaps * old_V ^= q->V; old_V >>= q->min_slot_shift; if (old_V) { ... } * */ static void qfq_make_eligible(struct qfq_sched *q) { unsigned long vslot = q->V >> q->min_slot_shift; unsigned long old_vslot = q->oldV >> q->min_slot_shift; if (vslot != old_vslot) { unsigned long mask; int last_flip_pos = fls(vslot ^ old_vslot); if (last_flip_pos > 31) /* higher than the number of groups */ mask = ~0UL; /* make all groups eligible */ else mask = (1UL << last_flip_pos) - 1; qfq_move_groups(q, mask, IR, ER); qfq_move_groups(q, mask, IB, EB); } } /* * The index of the slot in which the input aggregate agg is to be * inserted must not be higher than QFQ_MAX_SLOTS-2. There is a '-2' * and not a '-1' because the start time of the group may be moved * backward by one slot after the aggregate has been inserted, and * this would cause non-empty slots to be right-shifted by one * position. * * QFQ+ fully satisfies this bound to the slot index if the parameters * of the classes are not changed dynamically, and if QFQ+ never * happens to postpone the service of agg unjustly, i.e., it never * happens that the aggregate becomes backlogged and eligible, or just * eligible, while an aggregate with a higher approximated finish time * is being served. In particular, in this case QFQ+ guarantees that * the timestamps of agg are low enough that the slot index is never * higher than 2. Unfortunately, QFQ+ cannot provide the same * guarantee if it happens to unjustly postpone the service of agg, or * if the parameters of some class are changed. * * As for the first event, i.e., an out-of-order service, the * upper bound to the slot index guaranteed by QFQ+ grows to * 2 + * QFQ_MAX_AGG_CLASSES * ((1<<QFQ_MTU_SHIFT)/QFQ_MIN_LMAX) * * (current_max_weight/current_wsum) <= 2 + 8 * 128 * 1. * * The following function deals with this problem by backward-shifting * the timestamps of agg, if needed, so as to guarantee that the slot * index is never higher than QFQ_MAX_SLOTS-2. This backward-shift may * cause the service of other aggregates to be postponed, yet the * worst-case guarantees of these aggregates are not violated. In * fact, in case of no out-of-order service, the timestamps of agg * would have been even lower than they are after the backward shift, * because QFQ+ would have guaranteed a maximum value equal to 2 for * the slot index, and 2 < QFQ_MAX_SLOTS-2. Hence the aggregates whose * service is postponed because of the backward-shift would have * however waited for the service of agg before being served. * * The other event that may cause the slot index to be higher than 2 * for agg is a recent change of the parameters of some class. If the * weight of a class is increased or the lmax (max_pkt_size) of the * class is decreased, then a new aggregate with smaller slot size * than the original parent aggregate of the class may happen to be * activated. The activation of this aggregate should be properly * delayed to when the service of the class has finished in the ideal * system tracked by QFQ+. If the activation of the aggregate is not * delayed to this reference time instant, then this aggregate may be * unjustly served before other aggregates waiting for service. This * may cause the above bound to the slot index to be violated for some * of these unlucky aggregates. * * Instead of delaying the activation of the new aggregate, which is * quite complex, the above-discussed capping of the slot index is * used to handle also the consequences of a change of the parameters * of a class. */ static void qfq_slot_insert(struct qfq_group *grp, struct qfq_aggregate *agg, u64 roundedS) { u64 slot = (roundedS - grp->S) >> grp->slot_shift; unsigned int i; /* slot index in the bucket list */ if (unlikely(slot > QFQ_MAX_SLOTS - 2)) { u64 deltaS = roundedS - grp->S - ((u64)(QFQ_MAX_SLOTS - 2)<<grp->slot_shift); agg->S -= deltaS; agg->F -= deltaS; slot = QFQ_MAX_SLOTS - 2; } i = (grp->front + slot) % QFQ_MAX_SLOTS; hlist_add_head(&agg->next, &grp->slots[i]); __set_bit(slot, &grp->full_slots); } /* Maybe introduce hlist_first_entry?? */ static struct qfq_aggregate *qfq_slot_head(struct qfq_group *grp) { return hlist_entry(grp->slots[grp->front].first, struct qfq_aggregate, next); } /* * remove the entry from the slot */ static void qfq_front_slot_remove(struct qfq_group *grp) { struct qfq_aggregate *agg = qfq_slot_head(grp); BUG_ON(!agg); hlist_del(&agg->next); if (hlist_empty(&grp->slots[grp->front])) __clear_bit(0, &grp->full_slots); } /* * Returns the first aggregate in the first non-empty bucket of the * group. As a side effect, adjusts the bucket list so the first * non-empty bucket is at position 0 in full_slots. */ static struct qfq_aggregate *qfq_slot_scan(struct qfq_group *grp) { unsigned int i; pr_debug("qfq slot_scan: grp %u full %#lx\n", grp->index, grp->full_slots); if (grp->full_slots == 0) return NULL; i = __ffs(grp->full_slots); /* zero based */ if (i > 0) { grp->front = (grp->front + i) % QFQ_MAX_SLOTS; grp->full_slots >>= i; } return qfq_slot_head(grp); } /* * adjust the bucket list. When the start time of a group decreases, * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to * move the objects. The mask of occupied slots must be shifted * because we use ffs() to find the first non-empty slot. * This covers decreases in the group's start time, but what about * increases of the start time ? * Here too we should make sure that i is less than 32 */ static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS) { unsigned int i = (grp->S - roundedS) >> grp->slot_shift; grp->full_slots <<= i; grp->front = (grp->front - i) % QFQ_MAX_SLOTS; } static void qfq_update_eligible(struct qfq_sched *q) { struct qfq_group *grp; unsigned long ineligible; ineligible = q->bitmaps[IR] | q->bitmaps[IB]; if (ineligible) { if (!q->bitmaps[ER]) { grp = qfq_ffs(q, ineligible); if (qfq_gt(grp->S, q->V)) q->V = grp->S; } qfq_make_eligible(q); } } /* Dequeue head packet of the head class in the DRR queue of the aggregate. */ static struct sk_buff *agg_dequeue(struct qfq_aggregate *agg, struct qfq_class *cl, unsigned int len) { struct sk_buff *skb = qdisc_dequeue_peeked(cl->qdisc); if (!skb) return NULL; cl->deficit -= (int) len; if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */ list_del(&cl->alist); else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) { cl->deficit += agg->lmax; list_move_tail(&cl->alist, &agg->active); } return skb; } static inline struct sk_buff *qfq_peek_skb(struct qfq_aggregate *agg, struct qfq_class **cl, unsigned int *len) { struct sk_buff *skb; *cl = list_first_entry(&agg->active, struct qfq_class, alist); skb = (*cl)->qdisc->ops->peek((*cl)->qdisc); if (skb == NULL) qdisc_warn_nonwc("qfq_dequeue", (*cl)->qdisc); else *len = qdisc_pkt_len(skb); return skb; } /* Update F according to the actual service received by the aggregate. */ static inline void charge_actual_service(struct qfq_aggregate *agg) { /* Compute the service received by the aggregate, taking into * account that, after decreasing the number of classes in * agg, it may happen that * agg->initial_budget - agg->budget > agg->bugdetmax */ u32 service_received = min(agg->budgetmax, agg->initial_budget - agg->budget); agg->F = agg->S + (u64)service_received * agg->inv_w; } /* Assign a reasonable start time for a new aggregate in group i. * Admissible values for \hat(F) are multiples of \sigma_i * no greater than V+\sigma_i . Larger values mean that * we had a wraparound so we consider the timestamp to be stale. * * If F is not stale and F >= V then we set S = F. * Otherwise we should assign S = V, but this may violate * the ordering in EB (see [2]). So, if we have groups in ER, * set S to the F_j of the first group j which would be blocking us. * We are guaranteed not to move S backward because * otherwise our group i would still be blocked. */ static void qfq_update_start(struct qfq_sched *q, struct qfq_aggregate *agg) { unsigned long mask; u64 limit, roundedF; int slot_shift = agg->grp->slot_shift; roundedF = qfq_round_down(agg->F, slot_shift); limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift); if (!qfq_gt(agg->F, q->V) || qfq_gt(roundedF, limit)) { /* timestamp was stale */ mask = mask_from(q->bitmaps[ER], agg->grp->index); if (mask) { struct qfq_group *next = qfq_ffs(q, mask); if (qfq_gt(roundedF, next->F)) { if (qfq_gt(limit, next->F)) agg->S = next->F; else /* preserve timestamp correctness */ agg->S = limit; return; } } agg->S = q->V; } else /* timestamp is not stale */ agg->S = agg->F; } /* Update the timestamps of agg before scheduling/rescheduling it for * service. In particular, assign to agg->F its maximum possible * value, i.e., the virtual finish time with which the aggregate * should be labeled if it used all its budget once in service. */ static inline void qfq_update_agg_ts(struct qfq_sched *q, struct qfq_aggregate *agg, enum update_reason reason) { if (reason != requeue) qfq_update_start(q, agg); else /* just charge agg for the service received */ agg->S = agg->F; agg->F = agg->S + (u64)agg->budgetmax * agg->inv_w; } static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg); static struct sk_buff *qfq_dequeue(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_aggregate *in_serv_agg = q->in_serv_agg; struct qfq_class *cl; struct sk_buff *skb = NULL; /* next-packet len, 0 means no more active classes in in-service agg */ unsigned int len = 0; if (in_serv_agg == NULL) return NULL; if (!list_empty(&in_serv_agg->active)) skb = qfq_peek_skb(in_serv_agg, &cl, &len); /* * If there are no active classes in the in-service aggregate, * or if the aggregate has not enough budget to serve its next * class, then choose the next aggregate to serve. */ if (len == 0 || in_serv_agg->budget < len) { charge_actual_service(in_serv_agg); /* recharge the budget of the aggregate */ in_serv_agg->initial_budget = in_serv_agg->budget = in_serv_agg->budgetmax; if (!list_empty(&in_serv_agg->active)) { /* * Still active: reschedule for * service. Possible optimization: if no other * aggregate is active, then there is no point * in rescheduling this aggregate, and we can * just keep it as the in-service one. This * should be however a corner case, and to * handle it, we would need to maintain an * extra num_active_aggs field. */ qfq_update_agg_ts(q, in_serv_agg, requeue); qfq_schedule_agg(q, in_serv_agg); } else if (sch->q.qlen == 0) { /* no aggregate to serve */ q->in_serv_agg = NULL; return NULL; } /* * If we get here, there are other aggregates queued: * choose the new aggregate to serve. */ in_serv_agg = q->in_serv_agg = qfq_choose_next_agg(q); skb = qfq_peek_skb(in_serv_agg, &cl, &len); } if (!skb) return NULL; sch->q.qlen--; skb = agg_dequeue(in_serv_agg, cl, len); if (!skb) { sch->q.qlen++; return NULL; } qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); /* If lmax is lowered, through qfq_change_class, for a class * owning pending packets with larger size than the new value * of lmax, then the following condition may hold. */ if (unlikely(in_serv_agg->budget < len)) in_serv_agg->budget = 0; else in_serv_agg->budget -= len; q->V += (u64)len * q->iwsum; pr_debug("qfq dequeue: len %u F %lld now %lld\n", len, (unsigned long long) in_serv_agg->F, (unsigned long long) q->V); return skb; } static struct qfq_aggregate *qfq_choose_next_agg(struct qfq_sched *q) { struct qfq_group *grp; struct qfq_aggregate *agg, *new_front_agg; u64 old_F; qfq_update_eligible(q); q->oldV = q->V; if (!q->bitmaps[ER]) return NULL; grp = qfq_ffs(q, q->bitmaps[ER]); old_F = grp->F; agg = qfq_slot_head(grp); /* agg starts to be served, remove it from schedule */ qfq_front_slot_remove(grp); new_front_agg = qfq_slot_scan(grp); if (new_front_agg == NULL) /* group is now inactive, remove from ER */ __clear_bit(grp->index, &q->bitmaps[ER]); else { u64 roundedS = qfq_round_down(new_front_agg->S, grp->slot_shift); unsigned int s; if (grp->S == roundedS) return agg; grp->S = roundedS; grp->F = roundedS + (2ULL << grp->slot_shift); __clear_bit(grp->index, &q->bitmaps[ER]); s = qfq_calc_state(q, grp); __set_bit(grp->index, &q->bitmaps[s]); } qfq_unblock_groups(q, grp->index, old_F); return agg; } static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb), gso_segs; struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; struct qfq_aggregate *agg; int err = 0; bool first; cl = qfq_classify(skb, sch, &err); if (cl == NULL) { if (err & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return err; } pr_debug("qfq_enqueue: cl = %x\n", cl->common.classid); if (unlikely(cl->agg->lmax < len)) { pr_debug("qfq: increasing maxpkt from %u to %u for class %u", cl->agg->lmax, len, cl->common.classid); err = qfq_change_agg(sch, cl, cl->agg->class_weight, len); if (err) { cl->qstats.drops++; return qdisc_drop(skb, sch, to_free); } } gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1; first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); if (net_xmit_drop_count(err)) { cl->qstats.drops++; qdisc_qstats_drop(sch); } return err; } _bstats_update(&cl->bstats, len, gso_segs); sch->qstats.backlog += len; ++sch->q.qlen; agg = cl->agg; /* if the queue was not empty, then done here */ if (!first) { if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) && list_first_entry(&agg->active, struct qfq_class, alist) == cl && cl->deficit < len) list_move_tail(&cl->alist, &agg->active); return err; } /* schedule class for service within the aggregate */ cl->deficit = agg->lmax; list_add_tail(&cl->alist, &agg->active); if (list_first_entry(&agg->active, struct qfq_class, alist) != cl || q->in_serv_agg == agg) return err; /* non-empty or in service, nothing else to do */ qfq_activate_agg(q, agg, enqueue); return err; } /* * Schedule aggregate according to its timestamps. */ static void qfq_schedule_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { struct qfq_group *grp = agg->grp; u64 roundedS; int s; roundedS = qfq_round_down(agg->S, grp->slot_shift); /* * Insert agg in the correct bucket. * If agg->S >= grp->S we don't need to adjust the * bucket list and simply go to the insertion phase. * Otherwise grp->S is decreasing, we must make room * in the bucket list, and also recompute the group state. * Finally, if there were no flows in this group and nobody * was in ER make sure to adjust V. */ if (grp->full_slots) { if (!qfq_gt(grp->S, agg->S)) goto skip_update; /* create a slot for this agg->S */ qfq_slot_rotate(grp, roundedS); /* group was surely ineligible, remove */ __clear_bit(grp->index, &q->bitmaps[IR]); __clear_bit(grp->index, &q->bitmaps[IB]); } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V) && q->in_serv_agg == NULL) q->V = roundedS; grp->S = roundedS; grp->F = roundedS + (2ULL << grp->slot_shift); s = qfq_calc_state(q, grp); __set_bit(grp->index, &q->bitmaps[s]); pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n", s, q->bitmaps[s], (unsigned long long) agg->S, (unsigned long long) agg->F, (unsigned long long) q->V); skip_update: qfq_slot_insert(grp, agg, roundedS); } /* Update agg ts and schedule agg for service */ static void qfq_activate_agg(struct qfq_sched *q, struct qfq_aggregate *agg, enum update_reason reason) { agg->initial_budget = agg->budget = agg->budgetmax; /* recharge budg. */ qfq_update_agg_ts(q, agg, reason); if (q->in_serv_agg == NULL) { /* no aggr. in service or scheduled */ q->in_serv_agg = agg; /* start serving this aggregate */ /* update V: to be in service, agg must be eligible */ q->oldV = q->V = agg->S; } else if (agg != q->in_serv_agg) qfq_schedule_agg(q, agg); } static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, struct qfq_aggregate *agg) { unsigned int i, offset; u64 roundedS; roundedS = qfq_round_down(agg->S, grp->slot_shift); offset = (roundedS - grp->S) >> grp->slot_shift; i = (grp->front + offset) % QFQ_MAX_SLOTS; hlist_del(&agg->next); if (hlist_empty(&grp->slots[i])) __clear_bit(offset, &grp->full_slots); } /* * Called to forcibly deschedule an aggregate. If the aggregate is * not in the front bucket, or if the latter has other aggregates in * the front bucket, we can simply remove the aggregate with no other * side effects. * Otherwise we must propagate the event up. */ static void qfq_deactivate_agg(struct qfq_sched *q, struct qfq_aggregate *agg) { struct qfq_group *grp = agg->grp; unsigned long mask; u64 roundedS; int s; if (agg == q->in_serv_agg) { charge_actual_service(agg); q->in_serv_agg = qfq_choose_next_agg(q); return; } agg->F = agg->S; qfq_slot_remove(q, grp, agg); if (!grp->full_slots) { __clear_bit(grp->index, &q->bitmaps[IR]); __clear_bit(grp->index, &q->bitmaps[EB]); __clear_bit(grp->index, &q->bitmaps[IB]); if (test_bit(grp->index, &q->bitmaps[ER]) && !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) { mask = q->bitmaps[ER] & ((1UL << grp->index) - 1); if (mask) mask = ~((1UL << __fls(mask)) - 1); else mask = ~0UL; qfq_move_groups(q, mask, EB, ER); qfq_move_groups(q, mask, IB, IR); } __clear_bit(grp->index, &q->bitmaps[ER]); } else if (hlist_empty(&grp->slots[grp->front])) { agg = qfq_slot_scan(grp); roundedS = qfq_round_down(agg->S, grp->slot_shift); if (grp->S != roundedS) { __clear_bit(grp->index, &q->bitmaps[ER]); __clear_bit(grp->index, &q->bitmaps[IR]); __clear_bit(grp->index, &q->bitmaps[EB]); __clear_bit(grp->index, &q->bitmaps[IB]); grp->S = roundedS; grp->F = roundedS + (2ULL << grp->slot_shift); s = qfq_calc_state(q, grp); __set_bit(grp->index, &q->bitmaps[s]); } } } static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)arg; qfq_deactivate_class(q, cl); } static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_group *grp; int i, j, err; u32 max_cl_shift, maxbudg_shift, max_classes; err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; err = qdisc_class_hash_init(&q->clhash); if (err < 0) return err; max_classes = min_t(u64, (u64)qdisc_dev(sch)->tx_queue_len + 1, QFQ_MAX_AGG_CLASSES); /* max_cl_shift = floor(log_2(max_classes)) */ max_cl_shift = __fls(max_classes); q->max_agg_classes = 1<<max_cl_shift; /* maxbudg_shift = log2(max_len * max_classes_per_agg) */ maxbudg_shift = QFQ_MTU_SHIFT + max_cl_shift; q->min_slot_shift = FRAC_BITS + maxbudg_shift - QFQ_MAX_INDEX; for (i = 0; i <= QFQ_MAX_INDEX; i++) { grp = &q->groups[i]; grp->index = i; grp->slot_shift = q->min_slot_shift + i; for (j = 0; j < QFQ_MAX_SLOTS; j++) INIT_HLIST_HEAD(&grp->slots[j]); } INIT_HLIST_HEAD(&q->nonfull_aggs); return 0; } static void qfq_reset_qdisc(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; unsigned int i; for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->qdisc->q.qlen > 0) qfq_deactivate_class(q, cl); qdisc_reset(cl->qdisc); } } } static void qfq_destroy_qdisc(struct Qdisc *sch) { struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl; struct hlist_node *next; unsigned int i; tcf_block_put(q->block); for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i], common.hnode) { qfq_destroy_class(sch, cl); } } qdisc_class_hash_destroy(&q->clhash); } static const struct Qdisc_class_ops qfq_class_ops = { .change = qfq_change_class, .delete = qfq_delete_class, .find = qfq_search_class, .tcf_block = qfq_tcf_block, .bind_tcf = qfq_bind_tcf, .unbind_tcf = qfq_unbind_tcf, .graft = qfq_graft_class, .leaf = qfq_class_leaf, .qlen_notify = qfq_qlen_notify, .dump = qfq_dump_class, .dump_stats = qfq_dump_class_stats, .walk = qfq_walk, }; static struct Qdisc_ops qfq_qdisc_ops __read_mostly = { .cl_ops = &qfq_class_ops, .id = "qfq", .priv_size = sizeof(struct qfq_sched), .enqueue = qfq_enqueue, .dequeue = qfq_dequeue, .peek = qdisc_peek_dequeued, .init = qfq_init_qdisc, .reset = qfq_reset_qdisc, .destroy = qfq_destroy_qdisc, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("qfq"); static int __init qfq_init(void) { return register_qdisc(&qfq_qdisc_ops); } static void __exit qfq_exit(void) { unregister_qdisc(&qfq_qdisc_ops); } module_init(qfq_init); module_exit(qfq_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Quick Fair Queueing Plus qdisc"); |
34 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | /* SPDX-License-Identifier: GPL-2.0 */ /* File: linux/xattr.h Extended attributes handling. Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org> Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #ifndef _LINUX_XATTR_H #define _LINUX_XATTR_H #include <linux/slab.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/user_namespace.h> #include <uapi/linux/xattr.h> struct inode; struct dentry; static inline bool is_posix_acl_xattr(const char *name) { return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0); } /* * struct xattr_handler: When @name is set, match attributes with exactly that * name. When @prefix is set instead, match attributes with that prefix and * with a non-empty suffix. */ struct xattr_handler { const char *name; const char *prefix; int flags; /* fs private flags */ bool (*list)(struct dentry *dentry); int (*get)(const struct xattr_handler *, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size); int (*set)(const struct xattr_handler *, struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags); }; /** * xattr_handler_can_list - check whether xattr can be listed * @handler: handler for this type of xattr * @dentry: dentry whose inode xattr to list * * Determine whether the xattr associated with @dentry can be listed given * @handler. * * Return: true if xattr can be listed, false if not. */ static inline bool xattr_handler_can_list(const struct xattr_handler *handler, struct dentry *dentry) { return handler && (!handler->list || handler->list(dentry)); } const char *xattr_full_name(const struct xattr_handler *, const char *); struct xattr { const char *name; void *value; size_t value_len; }; ssize_t __vfs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t); ssize_t vfs_getxattr(struct mnt_idmap *, struct dentry *, const char *, void *, size_t); ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size); int __vfs_setxattr(struct mnt_idmap *, struct dentry *, struct inode *, const char *, const void *, size_t, int); int __vfs_setxattr_noperm(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_setxattr_locked(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int, struct inode **); int vfs_setxattr(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); int __vfs_removexattr_locked(struct mnt_idmap *, struct dentry *, const char *, struct inode **); int vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size); int vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, char **xattr_value, size_t size, gfp_t flags); int xattr_supports_user_prefix(struct inode *inode); static inline const char *xattr_prefix(const struct xattr_handler *handler) { return handler->prefix ?: handler->name; } struct simple_xattrs { struct rb_root rb_root; rwlock_t lock; }; struct simple_xattr { struct rb_node rb_node; char *name; size_t size; char value[]; }; void simple_xattrs_init(struct simple_xattrs *xattrs); void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space); size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); void simple_xattr_free(struct simple_xattr *xattr); int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size); struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags); ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size); void simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); #endif /* _LINUX_XATTR_H */ |
42 35 30 36 37 34 36 35 37 3 3 3 3 1 3 3 3 3 3 3 3 7 4 6 2 1 2 1 1 6 6 5 4 1 16 14 1 1 2 2 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 | /* * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/slab.h> #include <linux/types.h> #include <linux/rbtree.h> #include <linux/bitops.h> #include <linux/export.h> #include "rds.h" /* * This file implements the receive side of the unconventional congestion * management in RDS. * * Messages waiting in the receive queue on the receiving socket are accounted * against the sockets SO_RCVBUF option value. Only the payload bytes in the * message are accounted for. If the number of bytes queued equals or exceeds * rcvbuf then the socket is congested. All sends attempted to this socket's * address should return block or return -EWOULDBLOCK. * * Applications are expected to be reasonably tuned such that this situation * very rarely occurs. An application encountering this "back-pressure" is * considered a bug. * * This is implemented by having each node maintain bitmaps which indicate * which ports on bound addresses are congested. As the bitmap changes it is * sent through all the connections which terminate in the local address of the * bitmap which changed. * * The bitmaps are allocated as connections are brought up. This avoids * allocation in the interrupt handling path which queues messages on sockets. * The dense bitmaps let transports send the entire bitmap on any bitmap change * reasonably efficiently. This is much easier to implement than some * finer-grained communication of per-port congestion. The sender does a very * inexpensive bit test to test if the port it's about to send to is congested * or not. */ /* * Interaction with poll is a tad tricky. We want all processes stuck in * poll to wake up and check whether a congested destination became uncongested. * The really sad thing is we have no idea which destinations the application * wants to send to - we don't even know which rds_connections are involved. * So until we implement a more flexible rds poll interface, we have to make * do with this: * We maintain a global counter that is incremented each time a congestion map * update is received. Each rds socket tracks this value, and if rds_poll * finds that the saved generation number is smaller than the global generation * number, it wakes up the process. */ static atomic_t rds_cong_generation = ATOMIC_INIT(0); /* * Congestion monitoring */ static LIST_HEAD(rds_cong_monitor); static DEFINE_RWLOCK(rds_cong_monitor_lock); /* * Yes, a global lock. It's used so infrequently that it's worth keeping it * global to simplify the locking. It's only used in the following * circumstances: * * - on connection buildup to associate a conn with its maps * - on map changes to inform conns of a new map to send * * It's sadly ordered under the socket callback lock and the connection lock. * Receive paths can mark ports congested from interrupt context so the * lock masks interrupts. */ static DEFINE_SPINLOCK(rds_cong_lock); static struct rb_root rds_cong_tree = RB_ROOT; static struct rds_cong_map *rds_cong_tree_walk(const struct in6_addr *addr, struct rds_cong_map *insert) { struct rb_node **p = &rds_cong_tree.rb_node; struct rb_node *parent = NULL; struct rds_cong_map *map; while (*p) { int diff; parent = *p; map = rb_entry(parent, struct rds_cong_map, m_rb_node); diff = rds_addr_cmp(addr, &map->m_addr); if (diff < 0) p = &(*p)->rb_left; else if (diff > 0) p = &(*p)->rb_right; else return map; } if (insert) { rb_link_node(&insert->m_rb_node, parent, p); rb_insert_color(&insert->m_rb_node, &rds_cong_tree); } return NULL; } /* * There is only ever one bitmap for any address. Connections try and allocate * these bitmaps in the process getting pointers to them. The bitmaps are only * ever freed as the module is removed after all connections have been freed. */ static struct rds_cong_map *rds_cong_from_addr(const struct in6_addr *addr) { struct rds_cong_map *map; struct rds_cong_map *ret = NULL; unsigned long zp; unsigned long i; unsigned long flags; map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); if (!map) return NULL; map->m_addr = *addr; init_waitqueue_head(&map->m_waitq); INIT_LIST_HEAD(&map->m_conn_list); for (i = 0; i < RDS_CONG_MAP_PAGES; i++) { zp = get_zeroed_page(GFP_KERNEL); if (zp == 0) goto out; map->m_page_addrs[i] = zp; } spin_lock_irqsave(&rds_cong_lock, flags); ret = rds_cong_tree_walk(addr, map); spin_unlock_irqrestore(&rds_cong_lock, flags); if (!ret) { ret = map; map = NULL; } out: if (map) { for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) free_page(map->m_page_addrs[i]); kfree(map); } rdsdebug("map %p for addr %pI6c\n", ret, addr); return ret; } /* * Put the conn on its local map's list. This is called when the conn is * really added to the hash. It's nested under the rds_conn_lock, sadly. */ void rds_cong_add_conn(struct rds_connection *conn) { unsigned long flags; rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong); spin_lock_irqsave(&rds_cong_lock, flags); list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list); spin_unlock_irqrestore(&rds_cong_lock, flags); } void rds_cong_remove_conn(struct rds_connection *conn) { unsigned long flags; rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong); spin_lock_irqsave(&rds_cong_lock, flags); list_del_init(&conn->c_map_item); spin_unlock_irqrestore(&rds_cong_lock, flags); } int rds_cong_get_maps(struct rds_connection *conn) { conn->c_lcong = rds_cong_from_addr(&conn->c_laddr); conn->c_fcong = rds_cong_from_addr(&conn->c_faddr); if (!(conn->c_lcong && conn->c_fcong)) return -ENOMEM; return 0; } void rds_cong_queue_updates(struct rds_cong_map *map) { struct rds_connection *conn; unsigned long flags; spin_lock_irqsave(&rds_cong_lock, flags); list_for_each_entry(conn, &map->m_conn_list, c_map_item) { struct rds_conn_path *cp = &conn->c_path[0]; rcu_read_lock(); if (!test_and_set_bit(0, &conn->c_map_queued) && !rds_destroy_pending(cp->cp_conn)) { rds_stats_inc(s_cong_update_queued); /* We cannot inline the call to rds_send_xmit() here * for two reasons (both pertaining to a TCP transport): * 1. When we get here from the receive path, we * are already holding the sock_lock (held by * tcp_v4_rcv()). So inlining calls to * tcp_setsockopt and/or tcp_sendmsg will deadlock * when it tries to get the sock_lock()) * 2. Interrupts are masked so that we can mark the * port congested from both send and recv paths. * (See comment around declaration of rdc_cong_lock). * An attempt to get the sock_lock() here will * therefore trigger warnings. * Defer the xmit to rds_send_worker() instead. */ queue_delayed_work(rds_wq, &cp->cp_send_w, 0); } rcu_read_unlock(); } spin_unlock_irqrestore(&rds_cong_lock, flags); } void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask) { rdsdebug("waking map %p for %pI4\n", map, &map->m_addr); rds_stats_inc(s_cong_update_received); atomic_inc(&rds_cong_generation); if (waitqueue_active(&map->m_waitq)) wake_up(&map->m_waitq); if (waitqueue_active(&rds_poll_waitq)) wake_up_all(&rds_poll_waitq); if (portmask && !list_empty(&rds_cong_monitor)) { unsigned long flags; struct rds_sock *rs; read_lock_irqsave(&rds_cong_monitor_lock, flags); list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) { spin_lock(&rs->rs_lock); rs->rs_cong_notify |= (rs->rs_cong_mask & portmask); rs->rs_cong_mask &= ~portmask; spin_unlock(&rs->rs_lock); if (rs->rs_cong_notify) rds_wake_sk_sleep(rs); } read_unlock_irqrestore(&rds_cong_monitor_lock, flags); } } EXPORT_SYMBOL_GPL(rds_cong_map_updated); int rds_cong_updated_since(unsigned long *recent) { unsigned long gen = atomic_read(&rds_cong_generation); if (likely(*recent == gen)) return 0; *recent = gen; return 1; } /* * We're called under the locking that protects the sockets receive buffer * consumption. This makes it a lot easier for the caller to only call us * when it knows that an existing set bit needs to be cleared, and vice versa. * We can't block and we need to deal with concurrent sockets working against * the same per-address map. */ void rds_cong_set_bit(struct rds_cong_map *map, __be16 port) { unsigned long i; unsigned long off; rdsdebug("setting congestion for %pI4:%u in map %p\n", &map->m_addr, ntohs(port), map); i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; set_bit_le(off, (void *)map->m_page_addrs[i]); } void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) { unsigned long i; unsigned long off; rdsdebug("clearing congestion for %pI4:%u in map %p\n", &map->m_addr, ntohs(port), map); i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; clear_bit_le(off, (void *)map->m_page_addrs[i]); } static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) { unsigned long i; unsigned long off; i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; return test_bit_le(off, (void *)map->m_page_addrs[i]); } void rds_cong_add_socket(struct rds_sock *rs) { unsigned long flags; write_lock_irqsave(&rds_cong_monitor_lock, flags); if (list_empty(&rs->rs_cong_list)) list_add(&rs->rs_cong_list, &rds_cong_monitor); write_unlock_irqrestore(&rds_cong_monitor_lock, flags); } void rds_cong_remove_socket(struct rds_sock *rs) { unsigned long flags; struct rds_cong_map *map; write_lock_irqsave(&rds_cong_monitor_lock, flags); list_del_init(&rs->rs_cong_list); write_unlock_irqrestore(&rds_cong_monitor_lock, flags); /* update congestion map for now-closed port */ spin_lock_irqsave(&rds_cong_lock, flags); map = rds_cong_tree_walk(&rs->rs_bound_addr, NULL); spin_unlock_irqrestore(&rds_cong_lock, flags); if (map && rds_cong_test_bit(map, rs->rs_bound_port)) { rds_cong_clear_bit(map, rs->rs_bound_port); rds_cong_queue_updates(map); } } int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs) { if (!rds_cong_test_bit(map, port)) return 0; if (nonblock) { if (rs && rs->rs_cong_monitor) { unsigned long flags; /* It would have been nice to have an atomic set_bit on * a uint64_t. */ spin_lock_irqsave(&rs->rs_lock, flags); rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port)); spin_unlock_irqrestore(&rs->rs_lock, flags); /* Test again - a congestion update may have arrived in * the meantime. */ if (!rds_cong_test_bit(map, port)) return 0; } rds_stats_inc(s_cong_send_error); return -ENOBUFS; } rds_stats_inc(s_cong_send_blocked); rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port)); return wait_event_interruptible(map->m_waitq, !rds_cong_test_bit(map, port)); } void rds_cong_exit(void) { struct rb_node *node; struct rds_cong_map *map; unsigned long i; while ((node = rb_first(&rds_cong_tree))) { map = rb_entry(node, struct rds_cong_map, m_rb_node); rdsdebug("freeing map %p\n", map); rb_erase(&map->m_rb_node, &rds_cong_tree); for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++) free_page(map->m_page_addrs[i]); kfree(map); } } /* * Allocate a RDS message containing a congestion update. */ struct rds_message *rds_cong_update_alloc(struct rds_connection *conn) { struct rds_cong_map *map = conn->c_lcong; struct rds_message *rm; rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES); if (!IS_ERR(rm)) rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP; return rm; } |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright Gavin Shan, IBM Corporation 2016. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/of.h> #include <linux/platform_device.h> #include <net/ncsi.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/addrconf.h> #include <net/ipv6.h> #include <net/genetlink.h> #include "internal.h" #include "ncsi-pkt.h" #include "ncsi-netlink.h" LIST_HEAD(ncsi_dev_list); DEFINE_SPINLOCK(ncsi_dev_lock); bool ncsi_channel_has_link(struct ncsi_channel *channel) { return !!(channel->modes[NCSI_MODE_LINK].data[2] & 0x1); } bool ncsi_channel_is_last(struct ncsi_dev_priv *ndp, struct ncsi_channel *channel) { struct ncsi_package *np; struct ncsi_channel *nc; NCSI_FOR_EACH_PACKAGE(ndp, np) NCSI_FOR_EACH_CHANNEL(np, nc) { if (nc == channel) continue; if (nc->state == NCSI_CHANNEL_ACTIVE && ncsi_channel_has_link(nc)) return false; } return true; } static void ncsi_report_link(struct ncsi_dev_priv *ndp, bool force_down) { struct ncsi_dev *nd = &ndp->ndev; struct ncsi_package *np; struct ncsi_channel *nc; unsigned long flags; nd->state = ncsi_dev_state_functional; if (force_down) { nd->link_up = 0; goto report; } nd->link_up = 0; NCSI_FOR_EACH_PACKAGE(ndp, np) { NCSI_FOR_EACH_CHANNEL(np, nc) { spin_lock_irqsave(&nc->lock, flags); if (!list_empty(&nc->link) || nc->state != NCSI_CHANNEL_ACTIVE) { spin_unlock_irqrestore(&nc->lock, flags); continue; } if (ncsi_channel_has_link(nc)) { spin_unlock_irqrestore(&nc->lock, flags); nd->link_up = 1; goto report; } spin_unlock_irqrestore(&nc->lock, flags); } } report: nd->handler(nd); } static void ncsi_channel_monitor(struct timer_list *t) { struct ncsi_channel *nc = from_timer(nc, t, monitor.timer); struct ncsi_package *np = nc->package; struct ncsi_dev_priv *ndp = np->ndp; struct ncsi_channel_mode *ncm; struct ncsi_cmd_arg nca; bool enabled, chained; unsigned int monitor_state; unsigned long flags; int state, ret; spin_lock_irqsave(&nc->lock, flags); state = nc->state; chained = !list_empty(&nc->link); enabled = nc->monitor.enabled; monitor_state = nc->monitor.state; spin_unlock_irqrestore(&nc->lock, flags); if (!enabled) return; /* expected race disabling timer */ if (WARN_ON_ONCE(chained)) goto bad_state; if (state != NCSI_CHANNEL_INACTIVE && state != NCSI_CHANNEL_ACTIVE) { bad_state: netdev_warn(ndp->ndev.dev, "Bad NCSI monitor state channel %d 0x%x %s queue\n", nc->id, state, chained ? "on" : "off"); spin_lock_irqsave(&nc->lock, flags); nc->monitor.enabled = false; spin_unlock_irqrestore(&nc->lock, flags); return; } switch (monitor_state) { case NCSI_CHANNEL_MONITOR_START: case NCSI_CHANNEL_MONITOR_RETRY: nca.ndp = ndp; nca.package = np->id; nca.channel = nc->id; nca.type = NCSI_PKT_CMD_GLS; nca.req_flags = 0; ret = ncsi_xmit_cmd(&nca); if (ret) netdev_err(ndp->ndev.dev, "Error %d sending GLS\n", ret); break; case NCSI_CHANNEL_MONITOR_WAIT ... NCSI_CHANNEL_MONITOR_WAIT_MAX: break; default: netdev_err(ndp->ndev.dev, "NCSI Channel %d timed out!\n", nc->id); ncsi_report_link(ndp, true); ndp->flags |= NCSI_DEV_RESHUFFLE; ncm = &nc->modes[NCSI_MODE_LINK]; spin_lock_irqsave(&nc->lock, flags); nc->monitor.enabled = false; nc->state = NCSI_CHANNEL_INVISIBLE; ncm->data[2] &= ~0x1; spin_unlock_irqrestore(&nc->lock, flags); spin_lock_irqsave(&ndp->lock, flags); nc->state = NCSI_CHANNEL_ACTIVE; list_add_tail_rcu(&nc->link, &ndp->channel_queue); spin_unlock_irqrestore(&ndp->lock, flags); ncsi_process_next_channel(ndp); return; } spin_lock_irqsave(&nc->lock, flags); nc->monitor.state++; spin_unlock_irqrestore(&nc->lock, flags); mod_timer(&nc->monitor.timer, jiffies + HZ); } void ncsi_start_channel_monitor(struct ncsi_channel *nc) { unsigned long flags; spin_lock_irqsave(&nc->lock, flags); WARN_ON_ONCE(nc->monitor.enabled); nc->monitor.enabled = true; nc->monitor.state = NCSI_CHANNEL_MONITOR_START; spin_unlock_irqrestore(&nc->lock, flags); mod_timer(&nc->monitor.timer, jiffies + HZ); } void ncsi_stop_channel_monitor(struct ncsi_channel *nc) { unsigned long flags; spin_lock_irqsave(&nc->lock, flags); if (!nc->monitor.enabled) { spin_unlock_irqrestore(&nc->lock, flags); return; } nc->monitor.enabled = false; spin_unlock_irqrestore(&nc->lock, flags); del_timer_sync(&nc->monitor.timer); } struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, unsigned char id) { struct ncsi_channel *nc; NCSI_FOR_EACH_CHANNEL(np, nc) { if (nc->id == id) return nc; } return NULL; } struct ncsi_channel *ncsi_add_channel(struct ncsi_package *np, unsigned char id) { struct ncsi_channel *nc, *tmp; int index; unsigned long flags; nc = kzalloc(sizeof(*nc), GFP_ATOMIC); if (!nc) return NULL; nc->id = id; nc->package = np; nc->state = NCSI_CHANNEL_INACTIVE; nc->monitor.enabled = false; timer_setup(&nc->monitor.timer, ncsi_channel_monitor, 0); spin_lock_init(&nc->lock); INIT_LIST_HEAD(&nc->link); for (index = 0; index < NCSI_CAP_MAX; index++) nc->caps[index].index = index; for (index = 0; index < NCSI_MODE_MAX; index++) nc->modes[index].index = index; spin_lock_irqsave(&np->lock, flags); tmp = ncsi_find_channel(np, id); if (tmp) { spin_unlock_irqrestore(&np->lock, flags); kfree(nc); return tmp; } list_add_tail_rcu(&nc->node, &np->channels); np->channel_num++; spin_unlock_irqrestore(&np->lock, flags); return nc; } static void ncsi_remove_channel(struct ncsi_channel *nc) { struct ncsi_package *np = nc->package; unsigned long flags; spin_lock_irqsave(&nc->lock, flags); /* Release filters */ kfree(nc->mac_filter.addrs); kfree(nc->vlan_filter.vids); nc->state = NCSI_CHANNEL_INACTIVE; spin_unlock_irqrestore(&nc->lock, flags); ncsi_stop_channel_monitor(nc); /* Remove and free channel */ spin_lock_irqsave(&np->lock, flags); list_del_rcu(&nc->node); np->channel_num--; spin_unlock_irqrestore(&np->lock, flags); kfree(nc); } struct ncsi_package *ncsi_find_package(struct ncsi_dev_priv *ndp, unsigned char id) { struct ncsi_package *np; NCSI_FOR_EACH_PACKAGE(ndp, np) { if (np->id == id) return np; } return NULL; } struct ncsi_package *ncsi_add_package(struct ncsi_dev_priv *ndp, unsigned char id) { struct ncsi_package *np, *tmp; unsigned long flags; np = kzalloc(sizeof(*np), GFP_ATOMIC); if (!np) return NULL; np->id = id; np->ndp = ndp; spin_lock_init(&np->lock); INIT_LIST_HEAD(&np->channels); np->channel_whitelist = UINT_MAX; spin_lock_irqsave(&ndp->lock, flags); tmp = ncsi_find_package(ndp, id); if (tmp) { spin_unlock_irqrestore(&ndp->lock, flags); kfree(np); return tmp; } list_add_tail_rcu(&np->node, &ndp->packages); ndp->package_num++; spin_unlock_irqrestore(&ndp->lock, flags); return np; } void ncsi_remove_package(struct ncsi_package *np) { struct ncsi_dev_priv *ndp = np->ndp; struct ncsi_channel *nc, *tmp; unsigned long flags; /* Release all child channels */ list_for_each_entry_safe(nc, tmp, &np->channels, node) ncsi_remove_channel(nc); /* Remove and free package */ spin_lock_irqsave(&ndp->lock, flags); list_del_rcu(&np->node); ndp->package_num--; spin_unlock_irqrestore(&ndp->lock, flags); kfree(np); } void ncsi_find_package_and_channel(struct ncsi_dev_priv *ndp, unsigned char id, struct ncsi_package **np, struct ncsi_channel **nc) { struct ncsi_package *p; struct ncsi_channel *c; p = ncsi_find_package(ndp, NCSI_PACKAGE_INDEX(id)); c = p ? ncsi_find_channel(p, NCSI_CHANNEL_INDEX(id)) : NULL; if (np) *np = p; if (nc) *nc = c; } /* For two consecutive NCSI commands, the packet IDs shouldn't * be same. Otherwise, the bogus response might be replied. So * the available IDs are allocated in round-robin fashion. */ struct ncsi_request *ncsi_alloc_request(struct ncsi_dev_priv *ndp, unsigned int req_flags) { struct ncsi_request *nr = NULL; int i, limit = ARRAY_SIZE(ndp->requests); unsigned long flags; /* Check if there is one available request until the ceiling */ spin_lock_irqsave(&ndp->lock, flags); for (i = ndp->request_id; i < limit; i++) { if (ndp->requests[i].used) continue; nr = &ndp->requests[i]; nr->used = true; nr->flags = req_flags; ndp->request_id = i + 1; goto found; } /* Fail back to check from the starting cursor */ for (i = NCSI_REQ_START_IDX; i < ndp->request_id; i++) { if (ndp->requests[i].used) continue; nr = &ndp->requests[i]; nr->used = true; nr->flags = req_flags; ndp->request_id = i + 1; goto found; } found: spin_unlock_irqrestore(&ndp->lock, flags); return nr; } void ncsi_free_request(struct ncsi_request *nr) { struct ncsi_dev_priv *ndp = nr->ndp; struct sk_buff *cmd, *rsp; unsigned long flags; bool driven; if (nr->enabled) { nr->enabled = false; del_timer_sync(&nr->timer); } spin_lock_irqsave(&ndp->lock, flags); cmd = nr->cmd; rsp = nr->rsp; nr->cmd = NULL; nr->rsp = NULL; nr->used = false; driven = !!(nr->flags & NCSI_REQ_FLAG_EVENT_DRIVEN); spin_unlock_irqrestore(&ndp->lock, flags); if (driven && cmd && --ndp->pending_req_num == 0) schedule_work(&ndp->work); /* Release command and response */ consume_skb(cmd); consume_skb(rsp); } struct ncsi_dev *ncsi_find_dev(struct net_device *dev) { struct ncsi_dev_priv *ndp; NCSI_FOR_EACH_DEV(ndp) { if (ndp->ndev.dev == dev) return &ndp->ndev; } return NULL; } static void ncsi_request_timeout(struct timer_list *t) { struct ncsi_request *nr = from_timer(nr, t, timer); struct ncsi_dev_priv *ndp = nr->ndp; struct ncsi_cmd_pkt *cmd; struct ncsi_package *np; struct ncsi_channel *nc; unsigned long flags; /* If the request already had associated response, * let the response handler to release it. */ spin_lock_irqsave(&ndp->lock, flags); nr->enabled = false; if (nr->rsp || !nr->cmd) { spin_unlock_irqrestore(&ndp->lock, flags); return; } spin_unlock_irqrestore(&ndp->lock, flags); if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) { if (nr->cmd) { /* Find the package */ cmd = (struct ncsi_cmd_pkt *) skb_network_header(nr->cmd); ncsi_find_package_and_channel(ndp, cmd->cmd.common.channel, &np, &nc); ncsi_send_netlink_timeout(nr, np, nc); } } /* Release the request */ ncsi_free_request(nr); } static void ncsi_suspend_channel(struct ncsi_dev_priv *ndp) { struct ncsi_dev *nd = &ndp->ndev; struct ncsi_package *np; struct ncsi_channel *nc, *tmp; struct ncsi_cmd_arg nca; unsigned long flags; int ret; np = ndp->active_package; nc = ndp->active_channel; nca.ndp = ndp; nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN; switch (nd->state) { case ncsi_dev_state_suspend: nd->state = ncsi_dev_state_suspend_select; fallthrough; case ncsi_dev_state_suspend_select: ndp->pending_req_num = 1; nca.type = NCSI_PKT_CMD_SP; nca.package = np->id; nca.channel = NCSI_RESERVED_CHANNEL; if (ndp->flags & NCSI_DEV_HWA) nca.bytes[0] = 0; else nca.bytes[0] = 1; /* To retrieve the last link states of channels in current * package when current active channel needs fail over to * another one. It means we will possibly select another * channel as next active one. The link states of channels * are most important factor of the selection. So we need * accurate link states. Unfortunately, the link states on * inactive channels can't be updated with LSC AEN in time. */ if (ndp->flags & NCSI_DEV_RESHUFFLE) nd->state = ncsi_dev_state_suspend_gls; else nd->state = ncsi_dev_state_suspend_dcnt; ret = ncsi_xmit_cmd(&nca); if (ret) goto error; break; case ncsi_dev_state_suspend_gls: ndp->pending_req_num = 1; nca.type = NCSI_PKT_CMD_GLS; nca.package = np->id; nca.channel = ndp->channel_probe_id; ret = ncsi_xmit_cmd(&nca); if (ret) goto error; ndp->channel_probe_id++; if (ndp->channel_probe_id == ndp->channel_count) { ndp->channel_probe_id = 0; nd->state = ncsi_dev_state_suspend_dcnt; } break; case ncsi_dev_state_suspend_dcnt: ndp->pending_req_num = 1; nca.type = NCSI_PKT_CMD_DCNT; nca.package = np->id; nca.channel = nc->id; nd->state = ncsi_dev_state_suspend_dc; ret = ncsi_xmit_cmd(&nca); if (ret) goto error; break; case ncsi_dev_state_suspend_dc: ndp->pending_req_num = 1; nca.type = NCSI_PKT_CMD_DC; nca.package = np->id; nca.channel = nc->id; nca.bytes[0] = 1; nd->state = ncsi_dev_state_suspend_deselect; ret = ncsi_xmit_cmd(&nca); if (ret) goto error; NCSI_FOR_EACH_CHANNEL(np, tmp) { /* If there is another channel active on this package * do not deselect the package. */ if (tmp != nc && tmp->state == NCSI_CHANNEL_ACTIVE) { nd->state = ncsi_dev_state_suspend_done; break; } } break; case ncsi_dev_state_suspend_deselect: ndp->pending_req_num = 1; nca.type = NCSI_PKT_CMD_DP; nca.package = np->id; nca.channel = NCSI_RESERVED_CHANNEL; nd->state = ncsi_dev_state_suspend_done; ret = ncsi_xmit_cmd(&nca); if (ret) goto error; break; case ncsi_dev_state_suspend_done: spin_lock_irqsave(&nc->lock, flags); nc->state = NCSI_CHANNEL_INACTIVE; spin_unlock_irqrestore(&nc->lock, flags); if (ndp->flags & NCSI_DEV_RESET) ncsi_reset_dev(nd); else ncsi_process_next_channel(ndp); break; default: netdev_warn(nd->dev, "Wrong NCSI state 0x%x in suspend\n", nd->state); } return; error: nd->state = ncsi_dev_state_functional; } /* Check the VLAN filter bitmap for a set filter, and construct a * "Set VLAN Filter - Disable" packet if found. */ static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, struct ncsi_cmd_arg *nca) { struct ncsi_channel_vlan_filter *ncf; unsigned long flags; void *bitmap; int index; u16 vid; ncf = &nc->vlan_filter; bitmap = &ncf->bitmap; spin_lock_irqsave(&nc->lock, flags); index = find_first_bit(bitmap, ncf->n_vids); if (index >= ncf->n_vids) { spin_unlock_irqrestore(&nc->lock, flags); return -1; } vid = ncf->vids[index]; clear_bit(index, bitmap); ncf->vids[index] = 0; spin_unlock_irqrestore(&nc->lock, flags); nca->type = NCSI_PKT_CMD_SVF; nca->words[1] = vid; /* HW filter index starts at 1 */ nca->bytes[6] = index + 1; nca->bytes[7] = 0x00; return 0; } /* Find an outstanding VLAN tag and construct a "Set VLAN Filter - Enable" * packet. */ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, struct ncsi_cmd_arg *nca) { struct ncsi_channel_vlan_filter *ncf; struct vlan_vid *vlan = NULL; unsigned long flags; int i, index; void *bitmap; u16 vid; if (list_empty(&ndp->vlan_vids)) return -1; ncf = &nc->vlan_filter; bitmap = &ncf->bitmap; spin_lock_irqsave(&nc->lock, flags); rcu_read_lock(); list_for_each_entry_rcu(vlan, &ndp->vlan_vids, list) { vid = vlan->vid; for (i = 0; i < ncf->n_vids; i++) if (ncf->vids[i] == vid) { vid = 0; break; } if (vid) break; } rcu_read_unlock(); if (!vid) { /* No VLAN ID is not set */ spin_unlock_irqrestore(&nc->lock, flags); return -1; } index = find_first_zero_bit(bitmap, ncf->n_vids); if (index < 0 || index >= ncf->n_vids) { netdev_err(ndp->ndev.dev, "Channel %u already has all VLAN filters set\n", nc->id); spin_unlock_irqrestore(&nc->lock, flags); return -1; } ncf->vids[index] = vid; set_bit(index, bitmap); spin_unlock_irqrestore(&nc->lock, flags); nca->type = NCSI_PKT_CMD_SVF; nca->words[1] = vid; /* HW filter index starts at 1 */ nca->bytes[6] = index + 1; nca->bytes[7] = 0x01; return 0; } static int ncsi_oem_keep_phy_intel(struct ncsi_cmd_arg *nca) { unsigned char data[NCSI_OEM_INTEL_CMD_KEEP_PHY_LEN]; int ret = 0; nca->payload = NCSI_OEM_INTEL_CMD_KEEP_PHY_LEN; memset(data, 0, NCSI_OEM_INTEL_CMD_KEEP_PHY_LEN); *(unsigned int *)data = ntohl((__force __be32)NCSI_OEM_MFR_INTEL_ID); data[4] = NCSI_OEM_INTEL_CMD_KEEP_PHY; /* PHY Link up attribute */ data[6] = 0x1; nca->data = data; ret = ncsi_xmit_cmd(nca); if (ret) netdev_err(nca->ndp->ndev.dev, "NCSI: Failed to transmit cmd 0x%x during configure\n", nca->type); return ret; } /* NCSI OEM Command APIs */ static int ncsi_oem_gma_handler_bcm(struct ncsi_cmd_arg *nca) { unsigned char data[NCSI_OEM_BCM_CMD_GMA_LEN]; int ret = 0; nca->payload = NCSI_OEM_BCM_CMD_GMA_LEN; memset(data, 0, NCSI_OEM_BCM_CMD_GMA_LEN); *(unsigned int *)data = ntohl((__force __be32)NCSI_OEM_MFR_BCM_ID); data[5] = NCSI_OEM_BCM_CMD_GMA; nca->data = data; ret = ncsi_xmit_cmd(nca); if (ret) netdev_err(nca->ndp->ndev.dev, "NCSI: Failed to transmit cmd 0x%x during configure\n", nca->type); return ret; } static int ncsi_oem_gma_handler_mlx(struct ncsi_cmd_arg *nca) { union { u8 data_u8[NCSI_OEM_MLX_CMD_GMA_LEN]; u32 data_u32[NCSI_OEM_MLX_CMD_GMA_LEN / sizeof(u32)]; } u; int ret = 0; nca->payload = NCSI_OEM_MLX_CMD_GMA_LEN; memset(&u, 0, sizeof(u)); u.data_u32[0] = ntohl((__force __be32)NCSI_OEM_MFR_MLX_ID); u.data_u8[5] = NCSI_OEM_MLX_CMD_GMA; u.data_u8[6] = NCSI_OEM_MLX_CMD_GMA_PARAM; nca->data = u.data_u8; ret = ncsi_xmit_cmd(nca); if (ret) netdev_err(nca->ndp->ndev.dev, "NCSI: Failed to transmit cmd 0x%x during configure\n", nca->type); return ret; } static int ncsi_oem_smaf_mlx(struct ncsi_cmd_arg *nca) { union { u8 data_u8[NCSI_OEM_MLX_CMD_SMAF_LEN]; u32 data_u32[NCSI_OEM_MLX_CMD_SMAF_LEN / sizeof(u32)]; } u; int ret = 0; memset(&u, 0, sizeof(u)); u.data_u32[0] = ntohl((__force __be32)NCSI_OEM_MFR_MLX_ID); u.data_u8[5] = NCSI_OEM_MLX_CMD_SMAF; u.data_u8[6] = NCSI_OEM_MLX_CMD_SMAF_PARAM; memcpy(&u.data_u8[MLX_SMAF_MAC_ADDR_OFFSET], nca->ndp->ndev.dev->dev_addr, ETH_ALEN); u.data_u8[MLX_SMAF_MED_SUPPORT_OFFSET] = (MLX_MC_RBT_AVL | MLX_MC_RBT_SUPPORT); nca->payload = NCSI_OEM_MLX_CMD_SMAF_LEN; nca->data = u.data_u8; ret = ncsi_xmit_cmd(nca); if (ret) netdev_err(nca->ndp->ndev.dev, "NCSI: Failed to transmit cmd 0x%x during probe\n", nca->type); return ret; } static int ncsi_oem_gma_handler_intel(struct ncsi_cmd_arg *nca) { unsigned char data[NCSI_OEM_INTEL_CMD_GMA_LEN]; int ret = 0; nca->payload = NCSI_OEM_INTEL_CMD_GMA_LEN; memset(data, 0, NCSI_OEM_INTEL_CMD_GMA_LEN); *(unsigned int *)data = ntohl((__force __be32)NCSI_OEM_MFR_INTEL_ID); data[4] = NCSI_OEM_INTEL_CMD_GMA; nca->data = data; ret = ncsi_xmit_cmd(nca); if (ret) netdev_err(nca->ndp->ndev.dev, "NCSI: Failed to transmit cmd 0x%x during configure\n", nca->type); return ret; } /* OEM Command handlers initialization */ static struct ncsi_oem_gma_handler { unsigned int mfr_id; int (*handler)(struct ncsi_cmd_arg *nca); } ncsi_oem_gma_handlers[] = { { NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm }, { NCSI_OEM_MFR_MLX_ID, ncsi_oem_gma_handler_mlx }, { NCSI_OEM_MFR_INTEL_ID, ncsi_oem_gma_handler_intel } }; static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id) { struct ncsi_oem_gma_handler *nch = NULL; int i; /* This function should only be called once, return if flag set */ if (nca->ndp->gma_flag == 1) return -1; /* Find gma handler for given manufacturer id */ for (i = 0; i < ARRAY_SIZE(ncsi_oem_gma_handlers); i++) { if (ncsi_oem_gma_handlers[i].mfr_id == mf_id) { if (ncsi_oem_gma_handlers[i].handler) nch = &ncsi_oem_gma_handlers[i]; break; } } if (!nch) { netdev_err(nca->ndp->ndev.dev, "NCSI: No GMA handler available for MFR-ID (0x%x)\n", mf_id); return -1; } /* Get Mac address from NCSI device */ return nch->handler(nca); } /* Determine if a given channel from the channel_queue should be used for Tx */ static bool ncsi_channel_is_tx(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc) { struct ncsi_channel_mode *ncm; struct ncsi_channel *channel; struct ncsi_package *np; /* Check if any other channel has Tx enabled; a channel may have already * been configured and removed from the channel queue. */ NCSI_FOR_EACH_PACKAGE(ndp, np) { if (!ndp->multi_package && np != nc->package) continue; NCSI_FOR_EACH_CHANNEL(np, channel) { ncm = &channel->modes[NCSI_MODE_TX_ENABLE]; if (ncm->enable) return false; } } /* This channel is the preferred channel and has link */ list_for_each_entry_rcu(channel, &ndp->channel_queue, link) { np = channel->package; if (np->preferred_channel && ncsi_channel_has_link(np->preferred_channel)) { return np->preferred_channel == nc; } } /* This channel has link */ if (ncsi_channel_has_link(nc)) return true; list_for_each_entry_rcu(channel, &ndp->channel_queue, link) if (ncsi_channel_has_link(channel)) return false; /* No other channel has link; default to this one */ return true; } /* Change the active Tx channel in a multi-channel setup */ int ncsi_update_tx_channel(struct ncsi_dev_priv *ndp, struct ncsi_package *package, struct ncsi_channel *disable, struct ncsi_channel *enable) { struct ncsi_cmd_arg nca; struct ncsi_channel *nc; struct ncsi_package *np; int ret = 0; if (!package->multi_channel && !ndp->multi_package) netdev_warn(ndp->ndev.dev, "NCSI: Trying to update Tx channel in single-channel mode\n"); nca.ndp = ndp; nca.req_flags = 0; /* Find current channel with Tx enabled */ NCSI_FOR_EACH_PACKAGE(ndp, np) { if (disable) break; if (!ndp->multi_package && np != package) continue; NCSI_FOR_EACH_CHANNEL(np, nc) if (nc->modes[NCSI_MODE_TX_ENABLE].enable) { disable = nc; break; } } /* Find a suitable channel for Tx */ NCSI_FOR_EACH_PACKAGE(ndp, np) { if (enable) break; if (!ndp->multi_package && np != package) continue; if (!(ndp->package_whitelist & (0x1 << np->id))) continue; if (np->preferred_channel && ncsi_channel_has_link(np->preferred_channel)) { enable = np->preferred_channel; break; } NCSI_FOR_EACH_CHANNEL(np, nc) { if (!(np->channel_whitelist & 0x1 << nc->id)) continue; if (nc->state != NCSI_CHANNEL_ACTIVE) continue; if (ncsi_channel_has_link(nc)) { enable = nc; break; } } } if (disable == enable) return -1; if (!enable) return -1; if (disable) { nca.channel = disable->id; nca.package = disable->package->id; nca.type = NCSI_PKT_CMD_DCNT; ret = ncsi_xmit_cmd(&nca); if (ret) netdev_err(ndp->ndev.dev, "Error %d sending DCNT\n", ret); } netdev_info(ndp->ndev.dev, "NCSI: channel %u enables Tx\n", enable->id); nca.channel = enable->id; nca.package = enable->package->id; nca.type = NCSI_PKT_CMD_ECNT; ret = ncsi_xmit_cmd(&nca); if (ret) netdev_err(ndp->ndev.dev, "Error %d sending ECNT\n", ret); return ret; } static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) { struct ncsi_package *np = ndp->active_package; struct ncsi_channel *nc = ndp->active_channel; struct ncsi_channel *hot_nc = NULL; struct ncsi_dev *nd = &ndp->ndev; struct net_device *dev = nd->dev; struct ncsi_cmd_arg nca; unsigned char index; unsigned long flags; int ret; nca.ndp = ndp; nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN; switch (nd->state) { case ncsi_dev_state_config: case ncsi_dev_state_config_sp: ndp->pending_req_num = 1; /* Select the specific package */ nca.type = NCSI_PKT_CMD_SP; if (ndp->flags & NCSI_DEV_HWA) nca.bytes[0] = 0; else nca.bytes[0] = 1; nca.package = np->id; nca.channel = NCSI_RESERVED_CHANNEL; ret = ncsi_xmit_cmd(&nca); if (ret) { netdev_err(ndp->ndev.dev, "NCSI: Failed to transmit CMD_SP\n"); goto error; } nd->state = ncsi_dev_state_config_cis; break; case ncsi_dev_state_config_cis: ndp->pending_req_num = 1; /* Clear initial state */ nca.type = NCSI_PKT_CMD_CIS; nca.package = np->id; nca.channel = nc->id; ret = ncsi_xmit_cmd(&nca); if (ret) { netdev_err(ndp->ndev.dev, "NCSI: Failed to transmit CMD_CIS\n"); goto error; } nd->state = IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC) ? ncsi_dev_state_config_oem_gma : ncsi_dev_state_config_clear_vids; break; case ncsi_dev_state_config_oem_gma: nd->state = ncsi_dev_state_config_clear_vids; nca.package = np->id; nca.channel = nc->id; ndp->pending_req_num = 1; if (nc->version.major >= 1 && nc->version.minor >= 2) { nca.type = NCSI_PKT_CMD_GMCMA; ret = ncsi_xmit_cmd(&nca); } else { nca.type = NCSI_PKT_CMD_OEM; ret = ncsi_gma_handler(&nca, nc->version.mf_id); } if (ret < 0) schedule_work(&ndp->work); break; case ncsi_dev_state_config_clear_vids: case ncsi_dev_state_config_svf: case ncsi_dev_state_config_ev: case ncsi_dev_state_config_sma: case ncsi_dev_state_config_ebf: case ncsi_dev_state_config_dgmf: case ncsi_dev_state_config_ecnt: case ncsi_dev_state_config_ec: case ncsi_dev_state_config_ae: case ncsi_dev_state_config_gls: ndp->pending_req_num = 1; nca.package = np->id; nca.channel = nc->id; /* Clear any active filters on the channel before setting */ if (nd->state == ncsi_dev_state_config_clear_vids) { ret = clear_one_vid(ndp, nc, &nca); if (ret) { nd->state = ncsi_dev_state_config_svf; schedule_work(&ndp->work); break; } /* Repeat */ nd->state = ncsi_dev_state_config_clear_vids; /* Add known VLAN tags to the filter */ } else if (nd->state == ncsi_dev_state_config_svf) { ret = set_one_vid(ndp, nc, &nca); if (ret) { nd->state = ncsi_dev_state_config_ev; schedule_work(&ndp->work); break; } /* Repeat */ nd->state = ncsi_dev_state_config_svf; /* Enable/Disable the VLAN filter */ } else if (nd->state == ncsi_dev_state_config_ev) { if (list_empty(&ndp->vlan_vids)) { nca.type = NCSI_PKT_CMD_DV; } else { nca.type = NCSI_PKT_CMD_EV; nca.bytes[3] = NCSI_CAP_VLAN_NO; } nd->state = ncsi_dev_state_config_sma; } else if (nd->state == ncsi_dev_state_config_sma) { /* Use first entry in unicast filter table. Note that * the MAC filter table starts from entry 1 instead of * 0. */ nca.type = NCSI_PKT_CMD_SMA; for (index = 0; index < 6; index++) nca.bytes[index] = dev->dev_addr[index]; nca.bytes[6] = 0x1; nca.bytes[7] = 0x1; nd->state = ncsi_dev_state_config_ebf; } else if (nd->state == ncsi_dev_state_config_ebf) { nca.type = NCSI_PKT_CMD_EBF; nca.dwords[0] = nc->caps[NCSI_CAP_BC].cap; /* if multicast global filtering is supported then * disable it so that all multicast packet will be * forwarded to management controller */ if (nc->caps[NCSI_CAP_GENERIC].cap & NCSI_CAP_GENERIC_MC) nd->state = ncsi_dev_state_config_dgmf; else if (ncsi_channel_is_tx(ndp, nc)) nd->state = ncsi_dev_state_config_ecnt; else nd->state = ncsi_dev_state_config_ec; } else if (nd->state == ncsi_dev_state_config_dgmf) { nca.type = NCSI_PKT_CMD_DGMF; if (ncsi_channel_is_tx(ndp, nc)) nd->state = ncsi_dev_state_config_ecnt; else nd->state = ncsi_dev_state_config_ec; } else if (nd->state == ncsi_dev_state_config_ecnt) { if (np->preferred_channel && nc != np->preferred_channel) netdev_info(ndp->ndev.dev, "NCSI: Tx failed over to channel %u\n", nc->id); nca.type = NCSI_PKT_CMD_ECNT; nd->state = ncsi_dev_state_config_ec; } else if (nd->state == ncsi_dev_state_config_ec) { /* Enable AEN if it's supported */ nca.type = NCSI_PKT_CMD_EC; nd->state = ncsi_dev_state_config_ae; if (!(nc->caps[NCSI_CAP_AEN].cap & NCSI_CAP_AEN_MASK)) nd->state = ncsi_dev_state_config_gls; } else if (nd->state == ncsi_dev_state_config_ae) { nca.type = NCSI_PKT_CMD_AE; nca.bytes[0] = 0; nca.dwords[1] = nc->caps[NCSI_CAP_AEN].cap; nd->state = ncsi_dev_state_config_gls; } else if (nd->state == ncsi_dev_state_config_gls) { nca.type = NCSI_PKT_CMD_GLS; nd->state = ncsi_dev_state_config_done; } ret = ncsi_xmit_cmd(&nca); if (ret) { netdev_err(ndp->ndev.dev, "NCSI: Failed to transmit CMD %x\n", nca.type); goto error; } break; case ncsi_dev_state_config_done: netdev_dbg(ndp->ndev.dev, "NCSI: channel %u config done\n", nc->id); spin_lock_irqsave(&nc->lock, flags); nc->state = NCSI_CHANNEL_ACTIVE; if (ndp->flags & NCSI_DEV_RESET) { /* A reset event happened during config, start it now */ nc->reconfigure_needed = false; spin_unlock_irqrestore(&nc->lock, flags); ncsi_reset_dev(nd); break; } if (nc->reconfigure_needed) { /* This channel's configuration has been updated * part-way during the config state - start the * channel configuration over */ nc->reconfigure_needed = false; nc->state = NCSI_CHANNEL_INACTIVE; spin_unlock_irqrestore(&nc->lock, flags); spin_lock_irqsave(&ndp->lock, flags); list_add_tail_rcu(&nc->link, &ndp->channel_queue); spin_unlock_irqrestore(&ndp->lock, flags); netdev_dbg(dev, "Dirty NCSI channel state reset\n"); ncsi_process_next_channel(ndp); break; } if (nc->modes[NCSI_MODE_LINK].data[2] & 0x1) { hot_nc = nc; } else { hot_nc = NULL; netdev_dbg(ndp->ndev.dev, "NCSI: channel %u link down after config\n", nc->id); } spin_unlock_irqrestore(&nc->lock, flags); /* Update the hot channel */ spin_lock_irqsave(&ndp->lock, flags); ndp->hot_channel = hot_nc; spin_unlock_irqrestore(&ndp->lock, flags); ncsi_start_channel_monitor(nc); ncsi_process_next_channel(ndp); break; default: netdev_alert(dev, "Wrong NCSI state 0x%x in config\n", nd->state); } return; error: ncsi_report_link(ndp, true); } static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp) { struct ncsi_channel *nc, *found, *hot_nc; struct ncsi_channel_mode *ncm; unsigned long flags, cflags; struct ncsi_package *np; bool with_link; spin_lock_irqsave(&ndp->lock, flags); hot_nc = ndp->hot_channel; spin_unlock_irqrestore(&ndp->lock, flags); /* By default the search is done once an inactive channel with up * link is found, unless a preferred channel is set. * If multi_package or multi_channel are configured all channels in the * whitelist are added to the channel queue. */ found = NULL; with_link = false; NCSI_FOR_EACH_PACKAGE(ndp, np) { if (!(ndp->package_whitelist & (0x1 << np->id))) continue; NCSI_FOR_EACH_CHANNEL(np, nc) { if (!(np->channel_whitelist & (0x1 << nc->id))) continue; spin_lock_irqsave(&nc->lock, cflags); if (!list_empty(&nc->link) || nc->state != NCSI_CHANNEL_INACTIVE) { spin_unlock_irqrestore(&nc->lock, cflags); continue; } if (!found) found = nc; if (nc == hot_nc) found = nc; ncm = &nc->modes[NCSI_MODE_LINK]; if (ncm->data[2] & 0x1) { found = nc; with_link = true; } /* If multi_channel is enabled configure all valid * channels whether or not they currently have link * so they will have AENs enabled. */ if (with_link || np->multi_channel) { spin_lock_irqsave(&ndp->lock, flags); list_add_tail_rcu(&nc->link, &ndp->channel_queue); spin_unlock_irqrestore(&ndp->lock, flags); netdev_dbg(ndp->ndev.dev, "NCSI: Channel %u added to queue (link %s)\n", nc->id, ncm->data[2] & 0x1 ? "up" : "down"); } spin_unlock_irqrestore(&nc->lock, cflags); if (with_link && !np->multi_channel) break; } if (with_link && !ndp->multi_package) break; } if (list_empty(&ndp->channel_queue) && found) { netdev_info(ndp->ndev.dev, "NCSI: No channel with link found, configuring channel %u\n", found->id); spin_lock_irqsave(&ndp->lock, flags); list_add_tail_rcu(&found->link, &ndp->channel_queue); spin_unlock_irqrestore(&ndp->lock, flags); } else if (!found) { netdev_warn(ndp->ndev.dev, "NCSI: No channel found to configure!\n"); ncsi_report_link(ndp, true); return -ENODEV; } return ncsi_process_next_channel(ndp); } static bool ncsi_check_hwa(struct ncsi_dev_priv *ndp) { struct ncsi_package *np; struct ncsi_channel *nc; unsigned int cap; bool has_channel = false; /* The hardware arbitration is disabled if any one channel * doesn't support explicitly. */ NCSI_FOR_EACH_PACKAGE(ndp, np) { NCSI_FOR_EACH_CHANNEL(np, nc) { has_channel = true; cap = nc->caps[NCSI_CAP_GENERIC].cap; if (!(cap & NCSI_CAP_GENERIC_HWA) || (cap & NCSI_CAP_GENERIC_HWA_MASK) != NCSI_CAP_GENERIC_HWA_SUPPORT) { ndp->flags &= ~NCSI_DEV_HWA; return false; } } } if (has_channel) { ndp->flags |= NCSI_DEV_HWA; return true; } ndp->flags &= ~NCSI_DEV_HWA; return false; } static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) { struct ncsi_dev *nd = &ndp->ndev; struct ncsi_package *np; struct ncsi_cmd_arg nca; unsigned char index; int ret; nca.ndp = ndp; nca.req_flags = NCSI_REQ_FLAG_EVENT_DRIVEN; switch (nd->state) { case ncsi_dev_state_probe: nd->state = ncsi_dev_state_probe_deselect; fallthrough; case ncsi_dev_state_probe_deselect: ndp->pending_req_num = 8; /* Deselect all possible packages */ nca.type = NCSI_PKT_CMD_DP; nca.channel = NCSI_RESERVED_CHANNEL; for (index = 0; index < 8; index++) { nca.package = index; ret = ncsi_xmit_cmd(&nca); if (ret) goto error; } nd->state = ncsi_dev_state_probe_package; break; case ncsi_dev_state_probe_package: ndp->pending_req_num = 1; nca.type = NCSI_PKT_CMD_SP; nca.bytes[0] = 1; nca.package = ndp->package_probe_id; nca.channel = NCSI_RESERVED_CHANNEL; ret = ncsi_xmit_cmd(&nca); if (ret) goto error; nd->state = ncsi_dev_state_probe_channel; break; case ncsi_dev_state_probe_channel: ndp->active_package = ncsi_find_package(ndp, ndp->package_probe_id); if (!ndp->active_package) { /* No response */ nd->state = ncsi_dev_state_probe_dp; schedule_work(&ndp->work); break; } nd->state = ncsi_dev_state_probe_cis; if (IS_ENABLED(CONFIG_NCSI_OEM_CMD_GET_MAC) && ndp->mlx_multi_host) nd->state = ncsi_dev_state_probe_mlx_gma; schedule_work(&ndp->work); break; case ncsi_dev_state_probe_mlx_gma: ndp->pending_req_num = 1; nca.type = NCSI_PKT_CMD_OEM; nca.package = ndp->active_package->id; nca.channel = 0; ret = ncsi_oem_gma_handler_mlx(&nca); if (ret) goto error; nd->state = ncsi_dev_state_probe_mlx_smaf; break; case ncsi_dev_state_probe_mlx_smaf: ndp->pending_req_num = 1; nca.type = NCSI_PKT_CMD_OEM; nca.package = ndp->active_package->id; nca.channel = 0; ret = ncsi_oem_smaf_mlx(&nca); if (ret) goto error; nd->state = ncsi_dev_state_probe_cis; break; case ncsi_dev_state_probe_keep_phy: ndp->pending_req_num = 1; nca.type = NCSI_PKT_CMD_OEM; nca.package = ndp->active_package->id; nca.channel = 0; ret = ncsi_oem_keep_phy_intel(&nca); if (ret) goto error; nd->state = ncsi_dev_state_probe_gvi; break; case ncsi_dev_state_probe_cis: case ncsi_dev_state_probe_gvi: case ncsi_dev_state_probe_gc: case ncsi_dev_state_probe_gls: np = ndp->active_package; ndp->pending_req_num = 1; /* Clear initial state Retrieve version, capability or link status */ if (nd->state == ncsi_dev_state_probe_cis) nca.type = NCSI_PKT_CMD_CIS; else if (nd->state == ncsi_dev_state_probe_gvi) nca.type = NCSI_PKT_CMD_GVI; else if (nd->state == ncsi_dev_state_probe_gc) nca.type = NCSI_PKT_CMD_GC; else nca.type = NCSI_PKT_CMD_GLS; nca.package = np->id; nca.channel = ndp->channel_probe_id; ret = ncsi_xmit_cmd(&nca); if (ret) goto error; if (nd->state == ncsi_dev_state_probe_cis) { nd->state = ncsi_dev_state_probe_gvi; if (IS_ENABLED(CONFIG_NCSI_OEM_CMD_KEEP_PHY) && ndp->channel_probe_id == 0) nd->state = ncsi_dev_state_probe_keep_phy; } else if (nd->state == ncsi_dev_state_probe_gvi) { nd->state = ncsi_dev_state_probe_gc; } else if (nd->state == ncsi_dev_state_probe_gc) { nd->state = ncsi_dev_state_probe_gls; } else { nd->state = ncsi_dev_state_probe_cis; ndp->channel_probe_id++; } if (ndp->channel_probe_id == ndp->channel_count) { ndp->channel_probe_id = 0; nd->state = ncsi_dev_state_probe_dp; } break; case ncsi_dev_state_probe_dp: ndp->pending_req_num = 1; /* Deselect the current package */ nca.type = NCSI_PKT_CMD_DP; nca.package = ndp->package_probe_id; nca.channel = NCSI_RESERVED_CHANNEL; ret = ncsi_xmit_cmd(&nca); if (ret) goto error; /* Probe next package */ ndp->package_probe_id++; if (ndp->package_probe_id >= 8) { /* Probe finished */ ndp->flags |= NCSI_DEV_PROBED; break; } nd->state = ncsi_dev_state_probe_package; ndp->active_package = NULL; break; default: netdev_warn(nd->dev, "Wrong NCSI state 0x%0x in enumeration\n", nd->state); } if (ndp->flags & NCSI_DEV_PROBED) { /* Check if all packages have HWA support */ ncsi_check_hwa(ndp); ncsi_choose_active_channel(ndp); } return; error: netdev_err(ndp->ndev.dev, "NCSI: Failed to transmit cmd 0x%x during probe\n", nca.type); ncsi_report_link(ndp, true); } static void ncsi_dev_work(struct work_struct *work) { struct ncsi_dev_priv *ndp = container_of(work, struct ncsi_dev_priv, work); struct ncsi_dev *nd = &ndp->ndev; switch (nd->state & ncsi_dev_state_major) { case ncsi_dev_state_probe: ncsi_probe_channel(ndp); break; case ncsi_dev_state_suspend: ncsi_suspend_channel(ndp); break; case ncsi_dev_state_config: ncsi_configure_channel(ndp); break; default: netdev_warn(nd->dev, "Wrong NCSI state 0x%x in workqueue\n", nd->state); } } int ncsi_process_next_channel(struct ncsi_dev_priv *ndp) { struct ncsi_channel *nc; int old_state; unsigned long flags; spin_lock_irqsave(&ndp->lock, flags); nc = list_first_or_null_rcu(&ndp->channel_queue, struct ncsi_channel, link); if (!nc) { spin_unlock_irqrestore(&ndp->lock, flags); goto out; } list_del_init(&nc->link); spin_unlock_irqrestore(&ndp->lock, flags); spin_lock_irqsave(&nc->lock, flags); old_state = nc->state; nc->state = NCSI_CHANNEL_INVISIBLE; spin_unlock_irqrestore(&nc->lock, flags); ndp->active_channel = nc; ndp->active_package = nc->package; switch (old_state) { case NCSI_CHANNEL_INACTIVE: ndp->ndev.state = ncsi_dev_state_config; netdev_dbg(ndp->ndev.dev, "NCSI: configuring channel %u\n", nc->id); ncsi_configure_channel(ndp); break; case NCSI_CHANNEL_ACTIVE: ndp->ndev.state = ncsi_dev_state_suspend; netdev_dbg(ndp->ndev.dev, "NCSI: suspending channel %u\n", nc->id); ncsi_suspend_channel(ndp); break; default: netdev_err(ndp->ndev.dev, "Invalid state 0x%x on %d:%d\n", old_state, nc->package->id, nc->id); ncsi_report_link(ndp, false); return -EINVAL; } return 0; out: ndp->active_channel = NULL; ndp->active_package = NULL; if (ndp->flags & NCSI_DEV_RESHUFFLE) { ndp->flags &= ~NCSI_DEV_RESHUFFLE; return ncsi_choose_active_channel(ndp); } ncsi_report_link(ndp, false); return -ENODEV; } static int ncsi_kick_channels(struct ncsi_dev_priv *ndp) { struct ncsi_dev *nd = &ndp->ndev; struct ncsi_channel *nc; struct ncsi_package *np; unsigned long flags; unsigned int n = 0; NCSI_FOR_EACH_PACKAGE(ndp, np) { NCSI_FOR_EACH_CHANNEL(np, nc) { spin_lock_irqsave(&nc->lock, flags); /* Channels may be busy, mark dirty instead of * kicking if; * a) not ACTIVE (configured) * b) in the channel_queue (to be configured) * c) it's ndev is in the config state */ if (nc->state != NCSI_CHANNEL_ACTIVE) { if ((ndp->ndev.state & 0xff00) == ncsi_dev_state_config || !list_empty(&nc->link)) { netdev_dbg(nd->dev, "NCSI: channel %p marked dirty\n", nc); nc->reconfigure_needed = true; } spin_unlock_irqrestore(&nc->lock, flags); continue; } spin_unlock_irqrestore(&nc->lock, flags); ncsi_stop_channel_monitor(nc); spin_lock_irqsave(&nc->lock, flags); nc->state = NCSI_CHANNEL_INACTIVE; spin_unlock_irqrestore(&nc->lock, flags); spin_lock_irqsave(&ndp->lock, flags); list_add_tail_rcu(&nc->link, &ndp->channel_queue); spin_unlock_irqrestore(&ndp->lock, flags); netdev_dbg(nd->dev, "NCSI: kicked channel %p\n", nc); n++; } } return n; } int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct ncsi_dev_priv *ndp; unsigned int n_vids = 0; struct vlan_vid *vlan; struct ncsi_dev *nd; bool found = false; if (vid == 0) return 0; nd = ncsi_find_dev(dev); if (!nd) { netdev_warn(dev, "NCSI: No net_device?\n"); return 0; } ndp = TO_NCSI_DEV_PRIV(nd); /* Add the VLAN id to our internal list */ list_for_each_entry_rcu(vlan, &ndp->vlan_vids, list) { n_vids++; if (vlan->vid == vid) { netdev_dbg(dev, "NCSI: vid %u already registered\n", vid); return 0; } } if (n_vids >= NCSI_MAX_VLAN_VIDS) { netdev_warn(dev, "tried to add vlan id %u but NCSI max already registered (%u)\n", vid, NCSI_MAX_VLAN_VIDS); return -ENOSPC; } vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); if (!vlan) return -ENOMEM; vlan->proto = proto; vlan->vid = vid; list_add_rcu(&vlan->list, &ndp->vlan_vids); netdev_dbg(dev, "NCSI: Added new vid %u\n", vid); found = ncsi_kick_channels(ndp) != 0; return found ? ncsi_process_next_channel(ndp) : 0; } EXPORT_SYMBOL_GPL(ncsi_vlan_rx_add_vid); int ncsi_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct vlan_vid *vlan, *tmp; struct ncsi_dev_priv *ndp; struct ncsi_dev *nd; bool found = false; if (vid == 0) return 0; nd = ncsi_find_dev(dev); if (!nd) { netdev_warn(dev, "NCSI: no net_device?\n"); return 0; } ndp = TO_NCSI_DEV_PRIV(nd); /* Remove the VLAN id from our internal list */ list_for_each_entry_safe(vlan, tmp, &ndp->vlan_vids, list) if (vlan->vid == vid) { netdev_dbg(dev, "NCSI: vid %u found, removing\n", vid); list_del_rcu(&vlan->list); found = true; kfree(vlan); } if (!found) { netdev_err(dev, "NCSI: vid %u wasn't registered!\n", vid); return -EINVAL; } found = ncsi_kick_channels(ndp) != 0; return found ? ncsi_process_next_channel(ndp) : 0; } EXPORT_SYMBOL_GPL(ncsi_vlan_rx_kill_vid); struct ncsi_dev *ncsi_register_dev(struct net_device *dev, void (*handler)(struct ncsi_dev *ndev)) { struct ncsi_dev_priv *ndp; struct ncsi_dev *nd; struct platform_device *pdev; struct device_node *np; unsigned long flags; int i; /* Check if the device has been registered or not */ nd = ncsi_find_dev(dev); if (nd) return nd; /* Create NCSI device */ ndp = kzalloc(sizeof(*ndp), GFP_ATOMIC); if (!ndp) return NULL; nd = &ndp->ndev; nd->state = ncsi_dev_state_registered; nd->dev = dev; nd->handler = handler; ndp->pending_req_num = 0; INIT_LIST_HEAD(&ndp->channel_queue); INIT_LIST_HEAD(&ndp->vlan_vids); INIT_WORK(&ndp->work, ncsi_dev_work); ndp->package_whitelist = UINT_MAX; /* Initialize private NCSI device */ spin_lock_init(&ndp->lock); INIT_LIST_HEAD(&ndp->packages); ndp->request_id = NCSI_REQ_START_IDX; for (i = 0; i < ARRAY_SIZE(ndp->requests); i++) { ndp->requests[i].id = i; ndp->requests[i].ndp = ndp; timer_setup(&ndp->requests[i].timer, ncsi_request_timeout, 0); } ndp->channel_count = NCSI_RESERVED_CHANNEL; spin_lock_irqsave(&ncsi_dev_lock, flags); list_add_tail_rcu(&ndp->node, &ncsi_dev_list); spin_unlock_irqrestore(&ncsi_dev_lock, flags); /* Register NCSI packet Rx handler */ ndp->ptype.type = cpu_to_be16(ETH_P_NCSI); ndp->ptype.func = ncsi_rcv_rsp; ndp->ptype.dev = dev; dev_add_pack(&ndp->ptype); pdev = to_platform_device(dev->dev.parent); if (pdev) { np = pdev->dev.of_node; if (np && (of_property_read_bool(np, "mellanox,multi-host") || of_property_read_bool(np, "mlx,multi-host"))) ndp->mlx_multi_host = true; } return nd; } EXPORT_SYMBOL_GPL(ncsi_register_dev); int ncsi_start_dev(struct ncsi_dev *nd) { struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); if (nd->state != ncsi_dev_state_registered && nd->state != ncsi_dev_state_functional) return -ENOTTY; if (!(ndp->flags & NCSI_DEV_PROBED)) { ndp->package_probe_id = 0; ndp->channel_probe_id = 0; nd->state = ncsi_dev_state_probe; schedule_work(&ndp->work); return 0; } return ncsi_reset_dev(nd); } EXPORT_SYMBOL_GPL(ncsi_start_dev); void ncsi_stop_dev(struct ncsi_dev *nd) { struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); struct ncsi_package *np; struct ncsi_channel *nc; bool chained; int old_state; unsigned long flags; /* Stop the channel monitor on any active channels. Don't reset the * channel state so we know which were active when ncsi_start_dev() * is next called. */ NCSI_FOR_EACH_PACKAGE(ndp, np) { NCSI_FOR_EACH_CHANNEL(np, nc) { ncsi_stop_channel_monitor(nc); spin_lock_irqsave(&nc->lock, flags); chained = !list_empty(&nc->link); old_state = nc->state; spin_unlock_irqrestore(&nc->lock, flags); WARN_ON_ONCE(chained || old_state == NCSI_CHANNEL_INVISIBLE); } } netdev_dbg(ndp->ndev.dev, "NCSI: Stopping device\n"); ncsi_report_link(ndp, true); } EXPORT_SYMBOL_GPL(ncsi_stop_dev); int ncsi_reset_dev(struct ncsi_dev *nd) { struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); struct ncsi_channel *nc, *active, *tmp; struct ncsi_package *np; unsigned long flags; spin_lock_irqsave(&ndp->lock, flags); if (!(ndp->flags & NCSI_DEV_RESET)) { /* Haven't been called yet, check states */ switch (nd->state & ncsi_dev_state_major) { case ncsi_dev_state_registered: case ncsi_dev_state_probe: /* Not even probed yet - do nothing */ spin_unlock_irqrestore(&ndp->lock, flags); return 0; case ncsi_dev_state_suspend: case ncsi_dev_state_config: /* Wait for the channel to finish its suspend/config * operation; once it finishes it will check for * NCSI_DEV_RESET and reset the state. */ ndp->flags |= NCSI_DEV_RESET; spin_unlock_irqrestore(&ndp->lock, flags); return 0; } } else { switch (nd->state) { case ncsi_dev_state_suspend_done: case ncsi_dev_state_config_done: case ncsi_dev_state_functional: /* Ok */ break; default: /* Current reset operation happening */ spin_unlock_irqrestore(&ndp->lock, flags); return 0; } } if (!list_empty(&ndp->channel_queue)) { /* Clear any channel queue we may have interrupted */ list_for_each_entry_safe(nc, tmp, &ndp->channel_queue, link) list_del_init(&nc->link); } spin_unlock_irqrestore(&ndp->lock, flags); active = NULL; NCSI_FOR_EACH_PACKAGE(ndp, np) { NCSI_FOR_EACH_CHANNEL(np, nc) { spin_lock_irqsave(&nc->lock, flags); if (nc->state == NCSI_CHANNEL_ACTIVE) { active = nc; nc->state = NCSI_CHANNEL_INVISIBLE; spin_unlock_irqrestore(&nc->lock, flags); ncsi_stop_channel_monitor(nc); break; } spin_unlock_irqrestore(&nc->lock, flags); } if (active) break; } if (!active) { /* Done */ spin_lock_irqsave(&ndp->lock, flags); ndp->flags &= ~NCSI_DEV_RESET; spin_unlock_irqrestore(&ndp->lock, flags); return ncsi_choose_active_channel(ndp); } spin_lock_irqsave(&ndp->lock, flags); ndp->flags |= NCSI_DEV_RESET; ndp->active_channel = active; ndp->active_package = active->package; spin_unlock_irqrestore(&ndp->lock, flags); nd->state = ncsi_dev_state_suspend; schedule_work(&ndp->work); return 0; } void ncsi_unregister_dev(struct ncsi_dev *nd) { struct ncsi_dev_priv *ndp = TO_NCSI_DEV_PRIV(nd); struct ncsi_package *np, *tmp; unsigned long flags; dev_remove_pack(&ndp->ptype); list_for_each_entry_safe(np, tmp, &ndp->packages, node) ncsi_remove_package(np); spin_lock_irqsave(&ncsi_dev_lock, flags); list_del_rcu(&ndp->node); spin_unlock_irqrestore(&ncsi_dev_lock, flags); disable_work_sync(&ndp->work); kfree(ndp); } EXPORT_SYMBOL_GPL(ncsi_unregister_dev); |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NFNETLINK_H #define _NFNETLINK_H #include <linux/netlink.h> #include <linux/capability.h> #include <net/netlink.h> #include <uapi/linux/netfilter/nfnetlink.h> struct nfnl_info { struct net *net; struct sock *sk; const struct nlmsghdr *nlh; const struct nfgenmsg *nfmsg; struct netlink_ext_ack *extack; }; enum nfnl_callback_type { NFNL_CB_UNSPEC = 0, NFNL_CB_MUTEX, NFNL_CB_RCU, NFNL_CB_BATCH, }; struct nfnl_callback { int (*call)(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]); const struct nla_policy *policy; enum nfnl_callback_type type; __u16 attr_count; }; enum nfnl_abort_action { NFNL_ABORT_NONE = 0, NFNL_ABORT_AUTOLOAD, NFNL_ABORT_VALIDATE, }; struct nfnetlink_subsystem { const char *name; __u8 subsys_id; /* nfnetlink subsystem ID */ __u8 cb_count; /* number of callbacks */ const struct nfnl_callback *cb; /* callback for individual types */ struct module *owner; int (*commit)(struct net *net, struct sk_buff *skb); int (*abort)(struct net *net, struct sk_buff *skb, enum nfnl_abort_action action); bool (*valid_genid)(struct net *net, u32 genid); }; int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n); int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n); int nfnetlink_has_listeners(struct net *net, unsigned int group); int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags); int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error); int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid); void nfnetlink_broadcast(struct net *net, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type) { return subsys << 8 | msg_type; } static inline void nfnl_fill_hdr(struct nlmsghdr *nlh, u8 family, u8 version, __be16 res_id) { struct nfgenmsg *nfmsg; nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = version; nfmsg->res_id = res_id; } static inline struct nlmsghdr *nfnl_msg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int flags, u8 family, u8 version, __be16 res_id) { struct nlmsghdr *nlh; nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags); if (!nlh) return NULL; nfnl_fill_hdr(nlh, family, version, res_id); return nlh; } void nfnl_lock(__u8 subsys_id); void nfnl_unlock(__u8 subsys_id); #ifdef CONFIG_PROVE_LOCKING bool lockdep_nfnl_is_held(__u8 subsys_id); #else static inline bool lockdep_nfnl_is_held(__u8 subsys_id) { return true; } #endif /* CONFIG_PROVE_LOCKING */ #define MODULE_ALIAS_NFNL_SUBSYS(subsys) \ MODULE_ALIAS("nfnetlink-subsys-" __stringify(subsys)) #endif /* _NFNETLINK_H */ |
1894 1901 1901 1270 3 3 3 1 35 426 44 408 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 | /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #ifndef _LINUX_BPF_VERIFIER_H #define _LINUX_BPF_VERIFIER_H 1 #include <linux/bpf.h> /* for enum bpf_reg_type */ #include <linux/btf.h> /* for struct btf and btf_id() */ #include <linux/filter.h> /* for MAX_BPF_STACK */ #include <linux/tnum.h> /* Maximum variable offset umax_value permitted when resolving memory accesses. * In practice this is far bigger than any realistic pointer offset; this limit * ensures that umax_value + (int)off + (int)size cannot overflow a u64. */ #define BPF_MAX_VAR_OFF (1 << 29) /* Maximum variable size permitted for ARG_CONST_SIZE[_OR_ZERO]. This ensures * that converting umax_value to int cannot overflow. */ #define BPF_MAX_VAR_SIZ (1 << 29) /* size of tmp_str_buf in bpf_verifier. * we need at least 306 bytes to fit full stack mask representation * (in the "-8,-16,...,-512" form) */ #define TMP_STR_BUF_LEN 320 /* Patch buffer size */ #define INSN_BUF_SIZE 32 /* Liveness marks, used for registers and spilled-regs (in stack slots). * Read marks propagate upwards until they find a write mark; they record that * "one of this state's descendants read this reg" (and therefore the reg is * relevant for states_equal() checks). * Write marks collect downwards and do not propagate; they record that "the * straight-line code that reached this state (from its parent) wrote this reg" * (and therefore that reads propagated from this state or its descendants * should not propagate to its parent). * A state with a write mark can receive read marks; it just won't propagate * them to its parent, since the write mark is a property, not of the state, * but of the link between it and its parent. See mark_reg_read() and * mark_stack_slot_read() in kernel/bpf/verifier.c. */ enum bpf_reg_liveness { REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */ REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */ REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */ REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */ }; /* For every reg representing a map value or allocated object pointer, * we consider the tuple of (ptr, id) for them to be unique in verifier * context and conside them to not alias each other for the purposes of * tracking lock state. */ struct bpf_active_lock { /* This can either be reg->map_ptr or reg->btf. If ptr is NULL, * there's no active lock held, and other fields have no * meaning. If non-NULL, it indicates that a lock is held and * id member has the reg->id of the register which can be >= 0. */ void *ptr; /* This will be reg->id */ u32 id; }; #define ITER_PREFIX "bpf_iter_" enum bpf_iter_state { BPF_ITER_STATE_INVALID, /* for non-first slot */ BPF_ITER_STATE_ACTIVE, BPF_ITER_STATE_DRAINED, }; struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; /* * Fixed part of pointer offset, pointer types only. * Or constant delta between "linked" scalars with the same ID. */ s32 off; union { /* valid when type == PTR_TO_PACKET */ int range; /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL */ struct { struct bpf_map *map_ptr; /* To distinguish map lookups from outer map * the map_uid is non-zero for registers * pointing to inner maps. */ u32 map_uid; }; /* for PTR_TO_BTF_ID */ struct { struct btf *btf; u32 btf_id; }; struct { /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ u32 mem_size; u32 dynptr_id; /* for dynptr slices */ }; /* For dynptr stack slots */ struct { enum bpf_dynptr_type type; /* A dynptr is 16 bytes so it takes up 2 stack slots. * We need to track which slot is the first slot * to protect against cases where the user may try to * pass in an address starting at the second slot of the * dynptr. */ bool first_slot; } dynptr; /* For bpf_iter stack slots */ struct { /* BTF container and BTF type ID describing * struct bpf_iter_<type> of an iterator state */ struct btf *btf; u32 btf_id; /* packing following two fields to fit iter state into 16 bytes */ enum bpf_iter_state state:2; int depth:30; } iter; /* Max size from any of the above. */ struct { unsigned long raw1; unsigned long raw2; } raw; u32 subprogno; /* for PTR_TO_FUNC */ }; /* For scalar types (SCALAR_VALUE), this represents our knowledge of * the actual value. * For pointer types, t |